summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-02-12 18:54:28 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-12 18:54:28 -0800
commit818099574b04c5301eacbbcd441022b353a65466 (patch)
tree77b3645b375105cb0389df2b4ea5ffa90329f7f8
parent802ea9d8645d33d24b7b4cd4537c14f3e698bde0 (diff)
parent6016daed58ee482a2f7684e93342e89139cf4419 (diff)
downloadlinux-818099574b04c5301eacbbcd441022b353a65466.tar.gz
linux-818099574b04c5301eacbbcd441022b353a65466.tar.bz2
linux-818099574b04c5301eacbbcd441022b353a65466.zip
Merge branch 'akpm' (patches from Andrew)
Merge third set of updates from Andrew Morton: - the rest of MM [ This includes getting rid of the numa hinting bits, in favor of just generic protnone logic. Yay. - Linus ] - core kernel - procfs - some of lib/ (lots of lib/ material this time) * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (104 commits) lib/lcm.c: replace include lib/percpu_ida.c: remove redundant includes lib/strncpy_from_user.c: replace module.h include lib/stmp_device.c: replace module.h include lib/sort.c: move include inside #if 0 lib/show_mem.c: remove redundant include lib/radix-tree.c: change to simpler include lib/plist.c: remove redundant include lib/nlattr.c: remove redundant include lib/kobject_uevent.c: remove redundant include lib/llist.c: remove redundant include lib/md5.c: simplify include lib/list_sort.c: rearrange includes lib/genalloc.c: remove redundant include lib/idr.c: remove redundant include lib/halfmd4.c: simplify includes lib/dynamic_queue_limits.c: simplify includes lib/sort.c: use simpler includes lib/interval_tree.c: simplify includes hexdump: make it return number of bytes placed in buffer ...
-rw-r--r--.gitignore5
-rw-r--r--Documentation/filesystems/proc.txt37
-rw-r--r--arch/alpha/include/asm/thread_info.h5
-rw-r--r--arch/alpha/kernel/signal.c2
-rw-r--r--arch/arc/include/asm/thread_info.h4
-rw-r--r--arch/arc/kernel/signal.c2
-rw-r--r--arch/arm/include/asm/pgtable-3level.h5
-rw-r--r--arch/arm/include/asm/thread_info.h4
-rw-r--r--arch/arm/kernel/signal.c4
-rw-r--r--arch/arm64/include/asm/thread_info.h4
-rw-r--r--arch/arm64/kernel/signal.c2
-rw-r--r--arch/arm64/kernel/signal32.c4
-rw-r--r--arch/avr32/include/asm/thread_info.h4
-rw-r--r--arch/avr32/kernel/asm-offsets.c1
-rw-r--r--arch/avr32/kernel/signal.c2
-rw-r--r--arch/blackfin/include/asm/thread_info.h4
-rw-r--r--arch/blackfin/kernel/signal.c2
-rw-r--r--arch/c6x/include/asm/thread_info.h4
-rw-r--r--arch/c6x/kernel/signal.c2
-rw-r--r--arch/cris/arch-v10/kernel/signal.c2
-rw-r--r--arch/cris/arch-v32/kernel/signal.c2
-rw-r--r--arch/cris/include/asm/thread_info.h4
-rw-r--r--arch/frv/include/asm/string.h1
-rw-r--r--arch/frv/include/asm/thread_info.h4
-rw-r--r--arch/frv/kernel/asm-offsets.c1
-rw-r--r--arch/frv/kernel/signal.c2
-rw-r--r--arch/frv/mm/extable.c23
-rw-r--r--arch/hexagon/include/asm/thread_info.h4
-rw-r--r--arch/hexagon/kernel/signal.c2
-rw-r--r--arch/ia64/include/asm/thread_info.h4
-rw-r--r--arch/ia64/kernel/signal.c2
-rw-r--r--arch/m32r/include/asm/thread_info.h5
-rw-r--r--arch/m32r/kernel/signal.c2
-rw-r--r--arch/m68k/include/asm/thread_info.h4
-rw-r--r--arch/m68k/kernel/signal.c4
-rw-r--r--arch/metag/include/asm/thread_info.h6
-rw-r--r--arch/metag/kernel/signal.c2
-rw-r--r--arch/microblaze/include/asm/thread_info.h4
-rw-r--r--arch/microblaze/kernel/signal.c2
-rw-r--r--arch/mips/include/asm/thread_info.h4
-rw-r--r--arch/mips/kernel/asm-offsets.c1
-rw-r--r--arch/mips/kernel/signal.c2
-rw-r--r--arch/mips/kernel/signal32.c2
-rw-r--r--arch/mn10300/include/asm/thread_info.h4
-rw-r--r--arch/mn10300/kernel/asm-offsets.c1
-rw-r--r--arch/mn10300/kernel/signal.c2
-rw-r--r--arch/openrisc/include/asm/thread_info.h4
-rw-r--r--arch/openrisc/kernel/signal.c2
-rw-r--r--arch/parisc/include/asm/thread_info.h4
-rw-r--r--arch/parisc/kernel/signal.c2
-rw-r--r--arch/powerpc/include/asm/pgtable.h54
-rw-r--r--arch/powerpc/include/asm/pte-common.h5
-rw-r--r--arch/powerpc/include/asm/pte-hash64.h6
-rw-r--r--arch/powerpc/include/asm/thread_info.h4
-rw-r--r--arch/powerpc/kernel/signal_32.c4
-rw-r--r--arch/powerpc/kernel/signal_64.c2
-rw-r--r--arch/powerpc/kernel/time.c32
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c2
-rw-r--r--arch/powerpc/mm/copro_fault.c8
-rw-r--r--arch/powerpc/mm/fault.c25
-rw-r--r--arch/powerpc/mm/pgtable.c11
-rw-r--r--arch/powerpc/mm/pgtable_64.c3
-rw-r--r--arch/s390/include/asm/string.h1
-rw-r--r--arch/s390/include/asm/thread_info.h4
-rw-r--r--arch/s390/kernel/compat_signal.c2
-rw-r--r--arch/s390/kernel/signal.c2
-rw-r--r--arch/score/include/asm/thread_info.h4
-rw-r--r--arch/score/kernel/asm-offsets.c1
-rw-r--r--arch/score/kernel/signal.c2
-rw-r--r--arch/sh/include/asm/thread_info.h4
-rw-r--r--arch/sh/kernel/asm-offsets.c1
-rw-r--r--arch/sh/kernel/signal_32.c4
-rw-r--r--arch/sh/kernel/signal_64.c4
-rw-r--r--arch/sparc/include/asm/thread_info_32.h6
-rw-r--r--arch/sparc/include/asm/thread_info_64.h12
-rw-r--r--arch/sparc/kernel/signal32.c4
-rw-r--r--arch/sparc/kernel/signal_32.c2
-rw-r--r--arch/sparc/kernel/signal_64.c2
-rw-r--r--arch/sparc/kernel/traps_64.c2
-rw-r--r--arch/tile/include/asm/thread_info.h4
-rw-r--r--arch/tile/kernel/signal.c2
-rw-r--r--arch/um/include/asm/thread_info.h4
-rw-r--r--arch/unicore32/include/asm/thread_info.h4
-rw-r--r--arch/unicore32/kernel/signal.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/pgtable.h46
-rw-r--r--arch/x86/include/asm/pgtable_64.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h41
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/rtc.c4
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/mm/gup.c4
-rw-r--r--arch/x86/platform/intel-mid/intel_mid_vrtc.c2
-rw-r--r--arch/x86/um/signal.c2
-rw-r--r--arch/xtensa/include/asm/thread_info.h5
-rw-r--r--arch/xtensa/kernel/signal.c2
-rw-r--r--drivers/acpi/acpica/utdebug.c4
-rw-r--r--drivers/block/xen-blkfront.c2
-rw-r--r--drivers/block/zram/zram_drv.c227
-rw-r--r--drivers/block/zram/zram_drv.h21
-rw-r--r--fs/dcache.c35
-rw-r--r--fs/drop_caches.c14
-rw-r--r--fs/gfs2/quota.c11
-rw-r--r--fs/inode.c15
-rw-r--r--fs/internal.h7
-rw-r--r--fs/proc/array.c34
-rw-r--r--fs/proc/generic.c2
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/task_mmu.c16
-rw-r--r--fs/select.c2
-rw-r--r--fs/super.c47
-rw-r--r--fs/xfs/xfs_buf.c13
-rw-r--r--fs/xfs/xfs_qm.c12
-rw-r--r--fs/xfs/xfs_super.c7
-rw-r--r--include/acpi/acoutput.h6
-rw-r--r--include/asm-generic/pgtable.h153
-rw-r--r--include/linux/bitmap.h22
-rw-r--r--include/linux/cpumask.h22
-rw-r--r--include/linux/cryptohash.h2
-rw-r--r--include/linux/fs.h6
-rw-r--r--include/linux/init_task.h3
-rw-r--r--include/linux/kernel.h3
-rw-r--r--include/linux/list_lru.h82
-rw-r--r--include/linux/memcontrol.h37
-rw-r--r--include/linux/migrate.h4
-rw-r--r--include/linux/mm.h19
-rw-r--r--include/linux/nodemask.h26
-rw-r--r--include/linux/printk.h6
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/linux/shrinker.h6
-rw-r--r--include/linux/slab.h31
-rw-r--r--include/linux/slab_def.h2
-rw-r--r--include/linux/slub_def.h2
-rw-r--r--include/linux/string.h3
-rw-r--r--include/linux/string_helpers.h4
-rw-r--r--include/linux/swapops.h2
-rw-r--r--include/linux/types.h5
-rw-r--r--include/linux/zpool.h5
-rw-r--r--include/linux/zsmalloc.h2
-rw-r--r--include/net/sock.h5
-rw-r--r--include/uapi/linux/mempolicy.h2
-rw-r--r--kernel/cgroup.c10
-rw-r--r--kernel/compat.c5
-rw-r--r--kernel/cpuset.c2
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/printk/printk.c12
-rw-r--r--kernel/sched/clock.c13
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/hrtimer.c2
-rw-r--r--kernel/time/posix-cpu-timers.c3
-rw-r--r--kernel/watchdog.c2
-rw-r--r--lib/Kconfig.debug3
-rw-r--r--lib/Makefile4
-rw-r--r--lib/bitmap.c80
-rw-r--r--lib/dynamic_queue_limits.c4
-rw-r--r--lib/genalloc.c3
-rw-r--r--lib/halfmd4.c2
-rw-r--r--lib/hexdump.c105
-rw-r--r--lib/idr.c1
-rw-r--r--lib/interval_tree.c4
-rw-r--r--lib/kobject_uevent.c1
-rw-r--r--lib/lcm.c2
-rw-r--r--lib/list_sort.c7
-rw-r--r--lib/llist.c1
-rw-r--r--lib/md5.c2
-rw-r--r--lib/nlattr.c1
-rw-r--r--lib/percpu_ida.c3
-rw-r--r--lib/plist.c1
-rw-r--r--lib/radix-tree.c2
-rw-r--r--lib/show_mem.c1
-rw-r--r--lib/sort.c6
-rw-r--r--lib/stmp_device.c3
-rw-r--r--lib/string.c8
-rw-r--r--lib/string_helpers.c26
-rw-r--r--lib/strncpy_from_user.c3
-rw-r--r--lib/test-hexdump.c180
-rw-r--r--lib/vsprintf.c12
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/compaction.c23
-rw-r--r--mm/gup.c10
-rw-r--r--mm/huge_memory.c50
-rw-r--r--mm/internal.h6
-rw-r--r--mm/list_lru.c467
-rw-r--r--mm/memcontrol.c188
-rw-r--r--mm/memory-failure.c13
-rw-r--r--mm/memory.c20
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/migrate.c8
-rw-r--r--mm/mm_init.c4
-rw-r--r--mm/mprotect.c48
-rw-r--r--mm/page_alloc.c19
-rw-r--r--mm/pgtable-generic.c2
-rw-r--r--mm/slab.c17
-rw-r--r--mm/slab.h67
-rw-r--r--mm/slab_common.c197
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c117
-rw-r--r--mm/vmscan.c85
-rw-r--r--mm/workingset.c9
-rw-r--r--mm/zbud.c3
-rw-r--r--mm/zpool.c6
-rw-r--r--mm/zsmalloc.c239
-rw-r--r--mm/zswap.c5
-rw-r--r--net/ipv4/tcp_memcontrol.c4
207 files changed, 2214 insertions, 1420 deletions
diff --git a/.gitignore b/.gitignore
index ce57b79670a5..9ac91060ea64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,6 +53,11 @@ Module.symvers
/debian/
#
+# tar directory (make tar*-pkg)
+#
+/tar-install/
+
+#
# git files that we don't want to ignore even it they are dot-files
#
!.gitignore
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index cf8fc2f0b34b..a07ba61662ed 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -145,6 +145,8 @@ Table 1-1: Process specific entries in /proc
stack Report full stack trace, enable via CONFIG_STACKTRACE
smaps a extension based on maps, showing the memory consumption of
each mapping and flags associated with it
+ numa_maps an extension based on maps, showing the memory locality and
+ binding policy as well as mem usage (in pages) of each mapping.
..............................................................................
For example, to get the status information of a process, all you have to do is
@@ -489,12 +491,47 @@ To clear the bits for the file mapped pages associated with the process
To clear the soft-dirty bit
> echo 4 > /proc/PID/clear_refs
+To reset the peak resident set size ("high water mark") to the process's
+current value:
+ > echo 5 > /proc/PID/clear_refs
+
Any other value written to /proc/PID/clear_refs will have no effect.
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
using /proc/kpageflags and number of times a page is mapped using
/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
+The /proc/pid/numa_maps is an extension based on maps, showing the memory
+locality and binding policy, as well as the memory usage (in pages) of
+each mapping. The output follows a general format where mapping details get
+summarized separated by blank spaces, one mapping per each file line:
+
+address policy mapping details
+
+00400000 default file=/usr/local/bin/app mapped=1 active=0 N3=1 kernelpagesize_kB=4
+00600000 default file=/usr/local/bin/app anon=1 dirty=1 N3=1 kernelpagesize_kB=4
+3206000000 default file=/lib64/ld-2.12.so mapped=26 mapmax=6 N0=24 N3=2 kernelpagesize_kB=4
+320621f000 default file=/lib64/ld-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4
+3206220000 default file=/lib64/ld-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4
+3206221000 default anon=1 dirty=1 N3=1 kernelpagesize_kB=4
+3206800000 default file=/lib64/libc-2.12.so mapped=59 mapmax=21 active=55 N0=41 N3=18 kernelpagesize_kB=4
+320698b000 default file=/lib64/libc-2.12.so
+3206b8a000 default file=/lib64/libc-2.12.so anon=2 dirty=2 N3=2 kernelpagesize_kB=4
+3206b8e000 default file=/lib64/libc-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4
+3206b8f000 default anon=3 dirty=3 active=1 N3=3 kernelpagesize_kB=4
+7f4dc10a2000 default anon=3 dirty=3 N3=3 kernelpagesize_kB=4
+7f4dc10b4000 default anon=2 dirty=2 active=1 N3=2 kernelpagesize_kB=4
+7f4dc1200000 default file=/anon_hugepage\040(deleted) huge anon=1 dirty=1 N3=1 kernelpagesize_kB=2048
+7fff335f0000 default stack anon=3 dirty=3 N3=3 kernelpagesize_kB=4
+7fff3369d000 default mapped=1 mapmax=35 active=0 N3=1 kernelpagesize_kB=4
+
+Where:
+"address" is the starting address for the mapping;
+"policy" reports the NUMA memory policy set for the mapping (see vm/numa_memory_policy.txt);
+"mapping details" summarizes mapping data such as mapping type, page usage counters,
+node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page
+size, in KB, that is backing the mapping up.
+
1.2 Kernel data
---------------
diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index 48bbea6898b3..d5b98ab514bb 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -27,8 +27,6 @@ struct thread_info {
int bpt_nsaved;
unsigned long bpt_addr[2]; /* breakpoint handling */
unsigned int bpt_insn[2];
-
- struct restart_block restart_block;
};
/*
@@ -40,9 +38,6 @@ struct thread_info {
.exec_domain = &default_exec_domain, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 6cec2881acbf..8dbfb15f1745 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -150,7 +150,7 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
struct switch_stack *sw = (struct switch_stack *)regs - 1;
long i, err = __get_user(regs->pc, &sc->sc_pc);
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
sw->r26 = (unsigned long) ret_from_sys_call;
diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h
index 02bc5ec0fb2e..1163a1838ac1 100644
--- a/arch/arc/include/asm/thread_info.h
+++ b/arch/arc/include/asm/thread_info.h
@@ -46,7 +46,6 @@ struct thread_info {
struct exec_domain *exec_domain;/* execution domain */
__u32 cpu; /* current CPU */
unsigned long thr_ptr; /* TLS ptr */
- struct restart_block restart_block;
};
/*
@@ -62,9 +61,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c
index cb3142a2d40b..114234e83caa 100644
--- a/arch/arc/kernel/signal.c
+++ b/arch/arc/kernel/signal.c
@@ -104,7 +104,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
struct pt_regs *regs = current_pt_regs();
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* Since we stacked the signal on a word boundary,
* then 'sp' should be word aligned here. If it's
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 18dbc82f85e5..423a5ac09d3a 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -257,7 +257,10 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
/* represent a notpresent pmd by zero, this is used by pmdp_invalidate */
-#define pmd_mknotpresent(pmd) (__pmd(0))
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return __pmd(0);
+}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index d890e41f5520..72812a1f3d1c 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -68,7 +68,6 @@ struct thread_info {
#ifdef CONFIG_ARM_THUMBEE
unsigned long thumbee_state; /* ThumbEE Handler Base register */
#endif
- struct restart_block restart_block;
};
#define INIT_THREAD_INFO(tsk) \
@@ -81,9 +80,6 @@ struct thread_info {
.cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
domain_val(DOMAIN_IO, DOMAIN_CLIENT), \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 8aa6f1b87c9e..023ac905e4c3 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -191,7 +191,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs)
struct sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
@@ -221,7 +221,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
struct rt_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 459bf8e53208..702e1e6a0d80 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -48,7 +48,6 @@ struct thread_info {
mm_segment_t addr_limit; /* address limit */
struct task_struct *task; /* main task structure */
struct exec_domain *exec_domain; /* execution domain */
- struct restart_block restart_block;
int preempt_count; /* 0 => preemptable, <0 => bug */
int cpu; /* cpu */
};
@@ -60,9 +59,6 @@ struct thread_info {
.flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 6fa792137eda..660ccf9f7524 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -131,7 +131,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
struct rt_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 128-bit boundary, then 'sp' should
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index e299de396e9b..c20a300e2213 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -347,7 +347,7 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs)
struct compat_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
@@ -381,7 +381,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs)
struct compat_rt_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h
index a978f3fe7c25..d56afa99a514 100644
--- a/arch/avr32/include/asm/thread_info.h
+++ b/arch/avr32/include/asm/thread_info.h
@@ -30,7 +30,6 @@ struct thread_info {
saved by debug handler
when setting up
trampoline */
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -41,9 +40,6 @@ struct thread_info {
.flags = 0, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall \
- } \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c
index d6a8193a1d2f..e41c84516e5d 100644
--- a/arch/avr32/kernel/asm-offsets.c
+++ b/arch/avr32/kernel/asm-offsets.c
@@ -18,7 +18,6 @@ void foo(void)
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_rar_saved, thread_info, rar_saved);
OFFSET(TI_rsr_saved, thread_info, rsr_saved);
- OFFSET(TI_restart_block, thread_info, restart_block);
BLANK();
OFFSET(TSK_active_mm, task_struct, active_mm);
BLANK();
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index d309fbcc3bd6..8f1c63b9b983 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -69,7 +69,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
sigset_t set;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
frame = (struct rt_sigframe __user *)regs->sp;
pr_debug("SIG return: frame = %p\n", frame);
diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h
index 55f473bdad36..57c3a8bd583d 100644
--- a/arch/blackfin/include/asm/thread_info.h
+++ b/arch/blackfin/include/asm/thread_info.h
@@ -42,7 +42,6 @@ struct thread_info {
int cpu; /* cpu we're on */
int preempt_count; /* 0 => preemptable, <0 => BUG */
mm_segment_t addr_limit; /* address limit */
- struct restart_block restart_block;
#ifndef CONFIG_SMP
struct l1_scratch_task_info l1_task_info;
#endif
@@ -58,9 +57,6 @@ struct thread_info {
.flags = 0, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index ef275571d885..f2a8b5493bd3 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -44,7 +44,7 @@ rt_restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *p
int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
#define RESTORE(x) err |= __get_user(regs->x, &sc->sc_##x)
diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h
index d4e9ef87076d..584e253f3217 100644
--- a/arch/c6x/include/asm/thread_info.h
+++ b/arch/c6x/include/asm/thread_info.h
@@ -45,7 +45,6 @@ struct thread_info {
int cpu; /* cpu we're on */
int preempt_count; /* 0 = preemptable, <0 = BUG */
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
};
/*
@@ -61,9 +60,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index fe68226f6c4d..3c4bb5a5c382 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -68,7 +68,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs)
sigset_t set;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a dword boundary,
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index 9b32d338838b..74d7ba35120d 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -67,7 +67,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
unsigned long old_usp;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* restore the regs from &sc->regs (same as sc, since regs is first)
* (sc is already checked for VERIFY_READ since the sigframe was
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index 78ce3b1c9bcb..870e3e069318 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -59,7 +59,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
unsigned long old_usp;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Restore the registers from &sc->regs. sc is already checked
diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h
index 55dede18c032..7286db5ed90e 100644
--- a/arch/cris/include/asm/thread_info.h
+++ b/arch/cris/include/asm/thread_info.h
@@ -38,7 +38,6 @@ struct thread_info {
0-0xBFFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -56,9 +55,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/frv/include/asm/string.h b/arch/frv/include/asm/string.h
index 5ed310f64b7e..1f6c35990439 100644
--- a/arch/frv/include/asm/string.h
+++ b/arch/frv/include/asm/string.h
@@ -33,7 +33,6 @@ extern void *memcpy(void *, const void *, __kernel_size_t);
#define __HAVE_ARCH_STRNCAT 1
#define __HAVE_ARCH_STRCMP 1
#define __HAVE_ARCH_STRNCMP 1
-#define __HAVE_ARCH_STRNICMP 1
#define __HAVE_ARCH_STRCHR 1
#define __HAVE_ARCH_STRRCHR 1
#define __HAVE_ARCH_STRSTR 1
diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h
index af29e17c0181..6b917f1c2955 100644
--- a/arch/frv/include/asm/thread_info.h
+++ b/arch/frv/include/asm/thread_info.h
@@ -41,7 +41,6 @@ struct thread_info {
* 0-0xBFFFFFFF for user-thead
* 0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -65,9 +64,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/frv/kernel/asm-offsets.c b/arch/frv/kernel/asm-offsets.c
index 9de96843a278..446e89d500cc 100644
--- a/arch/frv/kernel/asm-offsets.c
+++ b/arch/frv/kernel/asm-offsets.c
@@ -40,7 +40,6 @@ void foo(void)
OFFSET(TI_CPU, thread_info, cpu);
OFFSET(TI_PREEMPT_COUNT, thread_info, preempt_count);
OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit);
- OFFSET(TI_RESTART_BLOCK, thread_info, restart_block);
BLANK();
/* offsets into register file storage */
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index dc3d59de0870..336713ab4745 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -62,7 +62,7 @@ static int restore_sigcontext(struct sigcontext __user *sc, int *_gr8)
unsigned long tbr, psr;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
tbr = user->i.tbr;
psr = user->i.psr;
diff --git a/arch/frv/mm/extable.c b/arch/frv/mm/extable.c
index 2fb9b3ab57b9..8863d6c1df6e 100644
--- a/arch/frv/mm/extable.c
+++ b/arch/frv/mm/extable.c
@@ -10,29 +10,6 @@ extern const void __memset_end, __memset_user_error_lr, __memset_user_error_hand
extern const void __memcpy_end, __memcpy_user_error_lr, __memcpy_user_error_handler;
extern spinlock_t modlist_lock;
-/*****************************************************************************/
-/*
- *
- */
-static inline unsigned long search_one_table(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- while (first <= last) {
- const struct exception_table_entry __attribute__((aligned(8))) *mid;
- long diff;
-
- mid = (last - first) / 2 + first;
- diff = mid->insn - value;
- if (diff == 0)
- return mid->fixup;
- else if (diff < 0)
- first = mid + 1;
- else
- last = mid - 1;
- }
- return 0;
-} /* end search_one_table() */
/*****************************************************************************/
/*
diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h
index a59dad3b3695..bacd3d6030c5 100644
--- a/arch/hexagon/include/asm/thread_info.h
+++ b/arch/hexagon/include/asm/thread_info.h
@@ -56,7 +56,6 @@ struct thread_info {
* used for syscalls somehow;
* seems to have a function pointer and four arguments
*/
- struct restart_block restart_block;
/* Points to the current pt_regs frame */
struct pt_regs *regs;
/*
@@ -83,9 +82,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = 1, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.sp = 0, \
.regs = NULL, \
}
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index eadd70e47e7e..b039a624c170 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -239,7 +239,7 @@ asmlinkage int sys_rt_sigreturn(void)
sigset_t blocked;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
frame = (struct rt_sigframe __user *)pt_psp(regs);
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 5b17418b4223..c16f21a068ff 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -27,7 +27,6 @@ struct thread_info {
__u32 status; /* Thread synchronous flags */
mm_segment_t addr_limit; /* user-level address space limit */
int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */
- struct restart_block restart_block;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
__u64 ac_stamp;
__u64 ac_leave;
@@ -46,9 +45,6 @@ struct thread_info {
.cpu = 0, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#ifndef ASM_OFFSETS_C
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 6d92170be457..b3a124da71e5 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -46,7 +46,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
long err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* restore scratch that always needs gets updated during signal delivery: */
err = __get_user(flags, &sc->sc_flags);
diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h
index 00171703402f..32422d0211c3 100644
--- a/arch/m32r/include/asm/thread_info.h
+++ b/arch/m32r/include/asm/thread_info.h
@@ -34,7 +34,6 @@ struct thread_info {
0-0xBFFFFFFF for user-thread
0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -49,7 +48,6 @@ struct thread_info {
#define TI_CPU 0x00000010
#define TI_PRE_COUNT 0x00000014
#define TI_ADDR_LIMIT 0x00000018
-#define TI_RESTART_BLOCK 0x000001C
#endif
@@ -68,9 +66,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 95408b8f130a..7736c6660a15 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -48,7 +48,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
#define COPY(x) err |= __get_user(regs->x, &sc->sc_##x)
COPY(r4);
diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h
index 21a4784ca5a1..c54256e69e64 100644
--- a/arch/m68k/include/asm/thread_info.h
+++ b/arch/m68k/include/asm/thread_info.h
@@ -31,7 +31,6 @@ struct thread_info {
int preempt_count; /* 0 => preemptable, <0 => BUG */
__u32 cpu; /* should always be 0 on m68k */
unsigned long tp_value; /* thread pointer */
- struct restart_block restart_block;
};
#endif /* __ASSEMBLY__ */
@@ -41,9 +40,6 @@ struct thread_info {
.exec_domain = &default_exec_domain, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_stack (init_thread_union.stack)
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 967a8b7e1527..d7179281e74a 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -655,7 +655,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u
int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* get previous context */
if (copy_from_user(&context, usc, sizeof(context)))
@@ -693,7 +693,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw,
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err = __get_user(temp, &uc->uc_mcontext.version);
if (temp != MCONTEXT_VERSION)
diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h
index 47711336119e..afb3ca4776d1 100644
--- a/arch/metag/include/asm/thread_info.h
+++ b/arch/metag/include/asm/thread_info.h
@@ -35,9 +35,8 @@ struct thread_info {
int preempt_count; /* 0 => preemptable, <0 => BUG */
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
- u8 supervisor_stack[0];
+ u8 supervisor_stack[0] __aligned(8);
};
#else /* !__ASSEMBLY__ */
@@ -74,9 +73,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c
index 0d100d5c1407..ce49d429c74a 100644
--- a/arch/metag/kernel/signal.c
+++ b/arch/metag/kernel/signal.c
@@ -48,7 +48,7 @@ static int restore_sigcontext(struct pt_regs *regs,
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL,
&sc->regs);
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index 8c9d36591a03..b699fbd7de4a 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -71,7 +71,6 @@ struct thread_info {
__u32 cpu; /* current CPU */
__s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
struct cpu_context cpu_context;
};
@@ -87,9 +86,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 235706055b7f..a1cbaf90e2ea 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -89,7 +89,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
int rval;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index e4440f92b366..9e1295f874f0 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -34,7 +34,6 @@ struct thread_info {
* 0x7fffffff for user-thead
* 0xffffffff for kernel-thread
*/
- struct restart_block restart_block;
struct pt_regs *regs;
long syscall; /* syscall number */
};
@@ -50,9 +49,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index b1d84bd4efb3..3b2dfdb4865f 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -98,7 +98,6 @@ void output_thread_info_defines(void)
OFFSET(TI_CPU, thread_info, cpu);
OFFSET(TI_PRE_COUNT, thread_info, preempt_count);
OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit);
- OFFSET(TI_RESTART_BLOCK, thread_info, restart_block);
OFFSET(TI_REGS, thread_info, regs);
DEFINE(_THREAD_SIZE, THREAD_SIZE);
DEFINE(_THREAD_MASK, THREAD_MASK);
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 545bf11bd2ed..6a28c792d862 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -243,7 +243,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
int i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err |= __get_user(regs->cp0_epc, &sc->sc_pc);
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index d69179c0d49d..19a7705f2a01 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -220,7 +220,7 @@ static int restore_sigcontext32(struct pt_regs *regs,
int i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err |= __get_user(regs->cp0_epc, &sc->sc_pc);
err |= __get_user(regs->hi, &sc->sc_mdhi);
diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h
index bf280eaccd36..c1c374f0ec12 100644
--- a/arch/mn10300/include/asm/thread_info.h
+++ b/arch/mn10300/include/asm/thread_info.h
@@ -50,7 +50,6 @@ struct thread_info {
0-0xBFFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -80,9 +79,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/mn10300/kernel/asm-offsets.c b/arch/mn10300/kernel/asm-offsets.c
index 47b3bb0c04ff..d780670cbaf3 100644
--- a/arch/mn10300/kernel/asm-offsets.c
+++ b/arch/mn10300/kernel/asm-offsets.c
@@ -28,7 +28,6 @@ void foo(void)
OFFSET(TI_cpu, thread_info, cpu);
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_addr_limit, thread_info, addr_limit);
- OFFSET(TI_restart_block, thread_info, restart_block);
BLANK();
OFFSET(REG_D0, pt_regs, d0);
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index a6c0858592c3..8609845f12c5 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -40,7 +40,7 @@ static int restore_sigcontext(struct pt_regs *regs,
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (is_using_fpu(current))
fpu_kill_state(current);
diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h
index d797acc901e4..875f0845a707 100644
--- a/arch/openrisc/include/asm/thread_info.h
+++ b/arch/openrisc/include/asm/thread_info.h
@@ -57,7 +57,6 @@ struct thread_info {
0-0x7FFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
/* saved context data */
@@ -79,9 +78,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = 1, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.ksp = 0, \
}
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 7d1b8235bf90..4112175bf803 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -46,7 +46,7 @@ static int restore_sigcontext(struct pt_regs *regs,
int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Restore the regs from &sc->regs.
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index a84611835549..fb13e3865563 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -14,7 +14,6 @@ struct thread_info {
mm_segment_t addr_limit; /* user-level address space limit */
__u32 cpu; /* current CPU */
int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */
- struct restart_block restart_block;
};
#define INIT_THREAD_INFO(tsk) \
@@ -25,9 +24,6 @@ struct thread_info {
.cpu = 0, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall \
- } \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 012d4fa63d97..9b910a0251b8 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -99,7 +99,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall)
sigframe_size = PARISC_RT_SIGFRAME_SIZE32;
#endif
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* Unwind the user stack to get the rt_sigframe structure. */
frame = (struct rt_sigframe __user *)
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 7e77f2ca5132..79fee2eb8d56 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -40,63 +40,27 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK)
static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
#ifdef CONFIG_NUMA_BALANCING
-static inline int pte_present(pte_t pte)
-{
- return pte_val(pte) & _PAGE_NUMA_MASK;
-}
-
-#define pte_present_nonuma pte_present_nonuma
-static inline int pte_present_nonuma(pte_t pte)
-{
- return pte_val(pte) & (_PAGE_PRESENT);
-}
-
-#define ptep_set_numa ptep_set_numa
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- if ((pte_val(*ptep) & _PAGE_PRESENT) == 0)
- VM_BUG_ON(1);
-
- pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
- return;
-}
-
-#define pmdp_set_numa pmdp_set_numa
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0)
- VM_BUG_ON(1);
-
- pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
- return;
-}
-
/*
- * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist
- * which was inherited from x86. For the purposes of powerpc pte_basic_t and
- * pmd_t are equivalent
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h . On powerpc, this will only
+ * work for user pages and always return true for kernel pages.
*/
-#define pteval_t pte_basic_t
-#define pmdval_t pmd_t
-static inline pteval_t ptenuma_flags(pte_t pte)
+static inline int pte_protnone(pte_t pte)
{
- return pte_val(pte) & _PAGE_NUMA_MASK;
+ return (pte_val(pte) &
+ (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
}
-static inline pmdval_t pmdnuma_flags(pmd_t pmd)
+static inline int pmd_protnone(pmd_t pmd)
{
- return pmd_val(pmd) & _PAGE_NUMA_MASK;
+ return pte_protnone(pmd_pte(pmd));
}
-
-# else
+#endif /* CONFIG_NUMA_BALANCING */
static inline int pte_present(pte_t pte)
{
return pte_val(pte) & _PAGE_PRESENT;
}
-#endif /* CONFIG_NUMA_BALANCING */
/* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index 2aef9b7a0eb2..c5a755ef7011 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -104,11 +104,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
_PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | \
_PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC)
-#ifdef CONFIG_NUMA_BALANCING
-/* Mask of bits that distinguish present and numa ptes */
-#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT)
-#endif
-
/*
* We define 2 sets of base prot bits, one for basic pages (ie,
* cacheable kernel and user pages) and one for non cacheable
diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index 2505d8eab15c..55aea0caf95e 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -27,12 +27,6 @@
#define _PAGE_RW 0x0200 /* software: user write access allowed */
#define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */
-/*
- * Used for tracking numa faults
- */
-#define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */
-
-
/* No separate kernel read-only */
#define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
#define _PAGE_KERNEL_RO _PAGE_KERNEL_RW
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index e8abc83e699f..72489799cf02 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -43,7 +43,6 @@ struct thread_info {
int cpu; /* cpu we're on */
int preempt_count; /* 0 => preemptable,
<0 => BUG */
- struct restart_block restart_block;
unsigned long local_flags; /* private flags for thread */
/* low level flags - has atomic operations done on it */
@@ -59,9 +58,6 @@ struct thread_info {
.exec_domain = &default_exec_domain, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.flags = 0, \
}
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index b171001698ff..d3a831ac0f92 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1231,7 +1231,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
int tm_restore = 0;
#endif
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
rt_sf = (struct rt_sigframe __user *)
(regs->gpr[1] + __SIGNAL_FRAMESIZE + 16);
@@ -1504,7 +1504,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
#endif
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
sc = &sf->sctx;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 2cb0c94cafa5..c7c24d2e2bdb 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -666,7 +666,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
#endif
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, uc, sizeof(*uc)))
goto badframe;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index fa7c4f12104f..7316dd15278a 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -621,6 +621,38 @@ unsigned long long sched_clock(void)
return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
}
+
+#ifdef CONFIG_PPC_PSERIES
+
+/*
+ * Running clock - attempts to give a view of time passing for a virtualised
+ * kernels.
+ * Uses the VTB register if available otherwise a next best guess.
+ */
+unsigned long long running_clock(void)
+{
+ /*
+ * Don't read the VTB as a host since KVM does not switch in host
+ * timebase into the VTB when it takes a guest off the CPU, reading the
+ * VTB would result in reading 'last switched out' guest VTB.
+ *
+ * Host kernels are often compiled with CONFIG_PPC_PSERIES checked, it
+ * would be unsafe to rely only on the #ifdef above.
+ */
+ if (firmware_has_feature(FW_FEATURE_LPAR) &&
+ cpu_has_feature(CPU_FTR_ARCH_207S))
+ return mulhdu(get_vtb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
+
+ /*
+ * This is a next best approximation without a VTB.
+ * On a host which is running bare metal there should never be any stolen
+ * time and on a host which doesn't do any virtualisation TB *should* equal
+ * VTB so it makes no difference anyway.
+ */
+ return local_clock() - cputime_to_nsecs(kcpustat_this_cpu->cpustat[CPUTIME_STEAL]);
+}
+#endif
+
static int __init get_freq(char *name, int cells, unsigned long *val)
{
struct device_node *cpu;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 510bdfbc4073..625407e4d3b0 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -212,7 +212,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* Look up the Linux PTE for the backing page */
pte_size = psize;
pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
- if (pte_present(pte) && !pte_numa(pte)) {
+ if (pte_present(pte) && !pte_protnone(pte)) {
if (writing && !pte_write(pte))
/* make the actual HPTE be read-only */
ptel = hpte_make_readonly(ptel);
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 1b5305d4bdab..f031a47d7701 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -64,10 +64,14 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
if (!(vma->vm_flags & VM_WRITE))
goto out_unlock;
} else {
- if (dsisr & DSISR_PROTFAULT)
- goto out_unlock;
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto out_unlock;
+ /*
+ * protfault should only happen due to us
+ * mapping a region readonly temporarily. PROT_NONE
+ * is also covered by the VMA check above.
+ */
+ WARN_ON_ONCE(dsisr & DSISR_PROTFAULT);
}
ret = 0;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 6154b0a2b063..b396868d2aa7 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -389,19 +389,6 @@ good_area:
#endif /* CONFIG_8xx */
if (is_exec) {
-#ifdef CONFIG_PPC_STD_MMU
- /* Protection fault on exec go straight to failure on
- * Hash based MMUs as they either don't support per-page
- * execute permission, or if they do, it's handled already
- * at the hash level. This test would probably have to
- * be removed if we change the way this works to make hash
- * processors use the same I/D cache coherency mechanism
- * as embedded.
- */
- if (error_code & DSISR_PROTFAULT)
- goto bad_area;
-#endif /* CONFIG_PPC_STD_MMU */
-
/*
* Allow execution from readable areas if the MMU does not
* provide separate controls over reading and executing.
@@ -416,6 +403,14 @@ good_area:
(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
!(vma->vm_flags & (VM_READ | VM_WRITE))))
goto bad_area;
+#ifdef CONFIG_PPC_STD_MMU
+ /*
+ * protfault should only happen due to us
+ * mapping a region readonly temporarily. PROT_NONE
+ * is also covered by the VMA check above.
+ */
+ WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
+#endif /* CONFIG_PPC_STD_MMU */
/* a write */
} else if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
@@ -423,11 +418,9 @@ good_area:
flags |= FAULT_FLAG_WRITE;
/* a read */
} else {
- /* protection fault */
- if (error_code & 0x08000000)
- goto bad_area;
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
+ WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
}
/*
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index c90e602677c9..83dfcb55ffef 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -172,9 +172,14 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte)
{
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
-#endif
+ /*
+ * When handling numa faults, we already have the pte marked
+ * _PAGE_PRESENT, but we can be sure that it is not in hpte.
+ * Hence we can use set_pte_at for them.
+ */
+ VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) ==
+ (_PAGE_PRESENT | _PAGE_USER));
+
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
* is called.
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 4fe5f64cc179..91bb8836825a 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -718,7 +718,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
- WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
+ WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) ==
+ (_PAGE_PRESENT | _PAGE_USER));
assert_spin_locked(&mm->page_table_lock);
WARN_ON(!pmd_trans_huge(pmd));
#endif
diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h
index 7e2dcd7c57ef..8662f5c8e17f 100644
--- a/arch/s390/include/asm/string.h
+++ b/arch/s390/include/asm/string.h
@@ -44,7 +44,6 @@ extern char *strstr(const char *, const char *);
#undef __HAVE_ARCH_STRCHR
#undef __HAVE_ARCH_STRNCHR
#undef __HAVE_ARCH_STRNCMP
-#undef __HAVE_ARCH_STRNICMP
#undef __HAVE_ARCH_STRPBRK
#undef __HAVE_ARCH_STRSEP
#undef __HAVE_ARCH_STRSPN
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 4d62fd5b56e5..ef1df718642d 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -39,7 +39,6 @@ struct thread_info {
unsigned long sys_call_table; /* System call table address */
unsigned int cpu; /* current CPU */
int preempt_count; /* 0 => preemptable, <0 => BUG */
- struct restart_block restart_block;
unsigned int system_call;
__u64 user_timer;
__u64 system_timer;
@@ -56,9 +55,6 @@ struct thread_info {
.flags = 0, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 34d5fa7b01b5..bc1df12dd4f8 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -209,7 +209,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
int i;
/* Alwys make any pending restarted system call return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs)))
return -EFAULT;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 6a2ac257d98f..b3ae6f70c6d6 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -162,7 +162,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
_sigregs user_sregs;
/* Alwys make any pending restarted system call return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs)))
return -EFAULT;
diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h
index 656b7ada9326..33864fa2a8d4 100644
--- a/arch/score/include/asm/thread_info.h
+++ b/arch/score/include/asm/thread_info.h
@@ -42,7 +42,6 @@ struct thread_info {
* 0-0xFFFFFFFF for kernel-thread
*/
mm_segment_t addr_limit;
- struct restart_block restart_block;
struct pt_regs *regs;
};
@@ -58,9 +57,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = 1, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/score/kernel/asm-offsets.c b/arch/score/kernel/asm-offsets.c
index 57788f44c6fb..b4d5214a7a7e 100644
--- a/arch/score/kernel/asm-offsets.c
+++ b/arch/score/kernel/asm-offsets.c
@@ -106,7 +106,6 @@ void output_thread_info_defines(void)
OFFSET(TI_CPU, thread_info, cpu);
OFFSET(TI_PRE_COUNT, thread_info, preempt_count);
OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit);
- OFFSET(TI_RESTART_BLOCK, thread_info, restart_block);
OFFSET(TI_REGS, thread_info, regs);
DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE);
DEFINE(KERNEL_STACK_MASK, THREAD_MASK);
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index 1651807774ad..e381c8c4ff65 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -141,7 +141,7 @@ score_rt_sigreturn(struct pt_regs *regs)
int sig;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
frame = (struct rt_sigframe __user *) regs->regs[0];
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index ad27ffa65e2e..657c03919627 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -33,7 +33,6 @@ struct thread_info {
__u32 cpu;
int preempt_count; /* 0 => preemptable, <0 => BUG */
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
unsigned long previous_sp; /* sp of previous stack in case
of nested IRQ stacks */
__u8 supervisor_stack[0];
@@ -63,9 +62,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
index 08a2be775b6c..542225fedb11 100644
--- a/arch/sh/kernel/asm-offsets.c
+++ b/arch/sh/kernel/asm-offsets.c
@@ -25,7 +25,6 @@ int main(void)
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
- DEFINE(TI_RESTART_BLOCK,offsetof(struct thread_info, restart_block));
DEFINE(TI_SIZE, sizeof(struct thread_info));
#ifdef CONFIG_HIBERNATION
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 2f002b24fb92..0b34f2a704fe 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -156,7 +156,7 @@ asmlinkage int sys_sigreturn(void)
int r0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
@@ -186,7 +186,7 @@ asmlinkage int sys_rt_sigreturn(void)
int r0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 897abe7b871e..71993c6a7d94 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -260,7 +260,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3,
long long ret;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
@@ -294,7 +294,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3,
long long ret;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h
index 025c98446b1e..fd7bd0a440ca 100644
--- a/arch/sparc/include/asm/thread_info_32.h
+++ b/arch/sparc/include/asm/thread_info_32.h
@@ -47,8 +47,6 @@ struct thread_info {
struct reg_window32 reg_window[NSWINS]; /* align for ldd! */
unsigned long rwbuf_stkptrs[NSWINS];
unsigned long w_saved;
-
- struct restart_block restart_block;
};
/*
@@ -62,9 +60,6 @@ struct thread_info {
.flags = 0, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
@@ -103,7 +98,6 @@ register struct thread_info *current_thread_info_reg asm("g6");
#define TI_REG_WINDOW 0x30
#define TI_RWIN_SPTRS 0x230
#define TI_W_SAVED 0x250
-/* #define TI_RESTART_BLOCK 0x25n */ /* Nobody cares */
/*
* thread information flag bit numbers
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 798f0279a4b5..ff455164732a 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -58,8 +58,6 @@ struct thread_info {
unsigned long gsr[7];
unsigned long xfsr[7];
- struct restart_block restart_block;
-
struct pt_regs *kern_una_regs;
unsigned int kern_una_insn;
@@ -92,10 +90,9 @@ struct thread_info {
#define TI_RWIN_SPTRS 0x000003c8
#define TI_GSR 0x00000400
#define TI_XFSR 0x00000438
-#define TI_RESTART_BLOCK 0x00000470
-#define TI_KUNA_REGS 0x000004a0
-#define TI_KUNA_INSN 0x000004a8
-#define TI_FPREGS 0x000004c0
+#define TI_KUNA_REGS 0x00000470
+#define TI_KUNA_INSN 0x00000478
+#define TI_FPREGS 0x00000480
/* We embed this in the uppermost byte of thread_info->flags */
#define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */
@@ -124,9 +121,6 @@ struct thread_info {
.current_ds = ASI_P, \
.exec_domain = &default_exec_domain, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index 62deba7be1a9..4eed773a7735 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -150,7 +150,7 @@ void do_sigreturn32(struct pt_regs *regs)
int err, i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
synchronize_user_stack();
@@ -235,7 +235,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs)
int err, i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
synchronize_user_stack();
regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 9ee72fc8e0e4..52aa5e4ce5e7 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -70,7 +70,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs)
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
synchronize_user_stack();
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 1a6999868031..d88beff47bab 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -254,7 +254,7 @@ void do_rt_sigreturn(struct pt_regs *regs)
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
synchronize_user_stack ();
sf = (struct rt_signal_frame __user *)
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 981a769b9558..a27651e866e7 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2730,8 +2730,6 @@ void __init trap_init(void)
TI_NEW_CHILD != offsetof(struct thread_info, new_child) ||
TI_CURRENT_DS != offsetof(struct thread_info,
current_ds) ||
- TI_RESTART_BLOCK != offsetof(struct thread_info,
- restart_block) ||
TI_KUNA_REGS != offsetof(struct thread_info,
kern_una_regs) ||
TI_KUNA_INSN != offsetof(struct thread_info,
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 48e4fd0f38e4..96c14c1430d8 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -36,7 +36,6 @@ struct thread_info {
mm_segment_t addr_limit; /* thread address space
(KERNEL_DS or USER_DS) */
- struct restart_block restart_block;
struct single_step_state *step_state; /* single step state
(if non-zero) */
int align_ctl; /* controls unaligned access */
@@ -57,9 +56,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.step_state = NULL, \
.align_ctl = 0, \
}
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index bb0a9ce7ae23..8a524e332c1a 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -48,7 +48,7 @@ int restore_sigcontext(struct pt_regs *regs,
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Enforce that sigcontext is like pt_regs, and doesn't mess
diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h
index 1c5b2a83046a..e04114c4fcd9 100644
--- a/arch/um/include/asm/thread_info.h
+++ b/arch/um/include/asm/thread_info.h
@@ -22,7 +22,6 @@ struct thread_info {
mm_segment_t addr_limit; /* thread address space:
0-0xBFFFFFFF for user
0-0xFFFFFFFF for kernel */
- struct restart_block restart_block;
struct thread_info *real_thread; /* Points to non-IRQ stack */
};
@@ -34,9 +33,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.real_thread = NULL, \
}
diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h
index af36d8eabdf1..63e2839dfeb8 100644
--- a/arch/unicore32/include/asm/thread_info.h
+++ b/arch/unicore32/include/asm/thread_info.h
@@ -79,7 +79,6 @@ struct thread_info {
#ifdef CONFIG_UNICORE_FPU_F64
struct fp_state fpstate __attribute__((aligned(8)));
#endif
- struct restart_block restart_block;
};
#define INIT_THREAD_INFO(tsk) \
@@ -89,9 +88,6 @@ struct thread_info {
.flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 7c8fb7018dc6..d329f85766cc 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -105,7 +105,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs)
struct rt_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f9e181aaba97..d0165c9a2932 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
u32 tmp;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
get_user_try {
/*
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0fe03f834fb1..67fc3d2b0aab 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -132,13 +132,7 @@ static inline int pte_exec(pte_t pte)
static inline int pte_special(pte_t pte)
{
- /*
- * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h.
- * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 ==
- * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
- */
- return (pte_flags(pte) & _PAGE_SPECIAL) &&
- (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
+ return pte_flags(pte) & _PAGE_SPECIAL;
}
static inline unsigned long pte_pfn(pte_t pte)
@@ -300,7 +294,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
static inline pmd_t pmd_mknotpresent(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_PRESENT);
+ return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -443,13 +437,6 @@ static inline int pte_same(pte_t a, pte_t b)
static inline int pte_present(pte_t a)
{
- return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
- _PAGE_NUMA);
-}
-
-#define pte_present_nonuma pte_present_nonuma
-static inline int pte_present_nonuma(pte_t a)
-{
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
@@ -459,7 +446,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
if (pte_flags(a) & _PAGE_PRESENT)
return true;
- if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+ if ((pte_flags(a) & _PAGE_PROTNONE) &&
mm_tlb_flush_pending(mm))
return true;
@@ -479,10 +466,25 @@ static inline int pmd_present(pmd_t pmd)
* the _PAGE_PSE flag will remain set at all times while the
* _PAGE_PRESENT bit is clear).
*/
- return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
- _PAGE_NUMA);
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
}
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h
+ */
+static inline int pte_protnone(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_PROTNONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_PROTNONE;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline int pmd_none(pmd_t pmd)
{
/* Only check low word on 32-bit platforms, since it might be
@@ -539,11 +541,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
static inline int pmd_bad(pmd_t pmd)
{
-#ifdef CONFIG_NUMA_BALANCING
- /* pmd_numa check */
- if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
- return 0;
-#endif
return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
}
@@ -862,19 +859,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present_nonuma(pte));
return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
static inline int pte_swp_soft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present_nonuma(pte));
return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
}
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present_nonuma(pte));
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
#endif
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e227970f983e..2ee781114d34 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/* Encode and de-code a swap entry */
#define SWP_TYPE_BITS 5
-#ifdef CONFIG_NUMA_BALANCING
-/* Automatic NUMA balancing needs to be distinguishable from swap entries */
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
-#else
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#endif
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3e0230c94cff..8c7c10802e9c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -27,14 +27,6 @@
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
-/*
- * Swap offsets on configurations that allow automatic NUMA balancing use the
- * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
- * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
- * maximum possible swap space from 16TB to 8TB.
- */
-#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
-
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -76,21 +68,6 @@
#endif
/*
- * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
- * that is not present. The hinting fault gathers numa placement statistics
- * (see pte_numa()). The bit is always zero when the PTE is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- */
-#ifdef CONFIG_NUMA_BALANCING
-#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
-#else
-#define _PAGE_NUMA (_AT(pteval_t, 0))
-#endif
-
-/*
* Tracking soft dirty bit when a page goes to a swap is tricky.
* We need a bit which can be stored in pte _and_ not conflict
* with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -122,8 +99,8 @@
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY | _PAGE_NUMA)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
+ _PAGE_SOFT_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
* The cache modes defined here are used to translate between pure SW usage
@@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte)
return native_pte_val(pte) & PTE_FLAGS_MASK;
}
-#ifdef CONFIG_NUMA_BALANCING
-/* Set of bits that distinguishes present, prot_none and numa ptes */
-#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
-static inline pteval_t ptenuma_flags(pte_t pte)
-{
- return pte_flags(pte) & _PAGE_NUMA_MASK;
-}
-
-static inline pmdval_t pmdnuma_flags(pmd_t pmd)
-{
- return pmd_flags(pmd) & _PAGE_NUMA_MASK;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e82e95abc92b..1d4e4f279a32 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -31,7 +31,6 @@ struct thread_info {
__u32 cpu; /* current CPU */
int saved_preempt_count;
mm_segment_t addr_limit;
- struct restart_block restart_block;
void __user *sysenter_return;
unsigned int sig_on_uaccess_error:1;
unsigned int uaccess_err:1; /* uaccess failed */
@@ -45,9 +44,6 @@ struct thread_info {
.cpu = 0, \
.saved_preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 319bcb9372fe..3acbff4716b0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line)
#define hpet_print_config() \
do { \
if (hpet_verbose) \
- _hpet_print_config(__FUNCTION__, __LINE__); \
+ _hpet_print_config(__func__, __LINE__); \
} while (0)
/*
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index fe3dbfe0c4a5..cd9685235df9 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now)
retval = set_rtc_time(&tm);
if (retval)
printk(KERN_ERR "%s: RTC write failed with error %d\n",
- __FUNCTION__, retval);
+ __func__, retval);
} else {
printk(KERN_ERR
"%s: Invalid RTC value: write of %lx to RTC failed\n",
- __FUNCTION__, nowtime);
+ __func__, nowtime);
retval = -EINVAL;
}
return retval;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 2a33c8f68319..e5042463c1bc 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
get_user_try {
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 89df70e0caa6..81bf3d2af3eb 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
struct page *page;
/* Similar to the PMD case, NUMA hinting must take slow path */
- if (pte_numa(pte)) {
+ if (pte_protnone(pte)) {
pte_unmap(ptep);
return 0;
}
@@ -178,7 +178,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
- if (pmd_numa(pmd))
+ if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
return 0;
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 4762cff7facd..32947ba0f62d 100644
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -110,7 +110,7 @@ int vrtc_set_mmss(const struct timespec *now)
spin_unlock_irqrestore(&rtc_lock, flags);
} else {
pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n",
- __FUNCTION__, now->tv_sec);
+ __func__, now->tv_sec);
retval = -EINVAL;
}
return retval;
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 79d824551c1a..0c8c32bfd792 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
int err, pid;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err = copy_from_user(&sc, from, sizeof(sc));
if (err)
diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h
index 470153e8547c..a9b5d3ba196c 100644
--- a/arch/xtensa/include/asm/thread_info.h
+++ b/arch/xtensa/include/asm/thread_info.h
@@ -51,7 +51,6 @@ struct thread_info {
__s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
unsigned long cpenable;
@@ -72,7 +71,6 @@ struct thread_info {
#define TI_CPU 0x00000010
#define TI_PRE_COUNT 0x00000014
#define TI_ADDR_LIMIT 0x00000018
-#define TI_RESTART_BLOCK 0x000001C
#endif
@@ -90,9 +88,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index 4612321c73cc..3d733ba16f28 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -245,7 +245,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
int ret;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (regs->depc > 64)
panic("rt_sigreturn in double exception!\n");
diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c
index 57078e3ea9b7..4f3f888d33bb 100644
--- a/drivers/acpi/acpica/utdebug.c
+++ b/drivers/acpi/acpica/utdebug.c
@@ -111,8 +111,8 @@ void acpi_ut_track_stack_ptr(void)
* RETURN: Updated pointer to the function name
*
* DESCRIPTION: Remove the "Acpi" prefix from the function name, if present.
- * This allows compiler macros such as __FUNCTION__ to be used
- * with no change to the debug output.
+ * This allows compiler macros such as __func__ to be used with no
+ * change to the debug output.
*
******************************************************************************/
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 7f66d2e08f19..37779e4c4585 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1391,7 +1391,7 @@ static int blkfront_probe(struct xenbus_device *dev,
if (major != XENVBD_MAJOR) {
printk(KERN_INFO
"%s: HVM does not support vbd %d as xen block device\n",
- __FUNCTION__, vdevice);
+ __func__, vdevice);
return -ENODEV;
}
}
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index bd8bda386e02..8e233edd7a09 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -53,9 +53,9 @@ static ssize_t name##_show(struct device *d, \
} \
static DEVICE_ATTR_RO(name);
-static inline int init_done(struct zram *zram)
+static inline bool init_done(struct zram *zram)
{
- return zram->meta != NULL;
+ return zram->disksize;
}
static inline struct zram *dev_to_zram(struct device *dev)
@@ -307,42 +307,67 @@ static inline int valid_io_request(struct zram *zram,
return 1;
}
-static void zram_meta_free(struct zram_meta *meta)
+static void zram_meta_free(struct zram_meta *meta, u64 disksize)
{
+ size_t num_pages = disksize >> PAGE_SHIFT;
+ size_t index;
+
+ /* Free all pages that are still in this zram device */
+ for (index = 0; index < num_pages; index++) {
+ unsigned long handle = meta->table[index].handle;
+
+ if (!handle)
+ continue;
+
+ zs_free(meta->mem_pool, handle);
+ }
+
zs_destroy_pool(meta->mem_pool);
vfree(meta->table);
kfree(meta);
}
-static struct zram_meta *zram_meta_alloc(u64 disksize)
+static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
{
size_t num_pages;
+ char pool_name[8];
struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+
if (!meta)
- goto out;
+ return NULL;
num_pages = disksize >> PAGE_SHIFT;
meta->table = vzalloc(num_pages * sizeof(*meta->table));
if (!meta->table) {
pr_err("Error allocating zram address table\n");
- goto free_meta;
+ goto out_error;
}
- meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
+ snprintf(pool_name, sizeof(pool_name), "zram%d", device_id);
+ meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
- goto free_table;
+ goto out_error;
}
return meta;
-free_table:
+out_error:
vfree(meta->table);
-free_meta:
kfree(meta);
- meta = NULL;
-out:
- return meta;
+ return NULL;
+}
+
+static inline bool zram_meta_get(struct zram *zram)
+{
+ if (atomic_inc_not_zero(&zram->refcount))
+ return true;
+ return false;
+}
+
+static inline void zram_meta_put(struct zram *zram)
+{
+ atomic_dec(&zram->refcount);
}
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
@@ -704,10 +729,11 @@ static void zram_bio_discard(struct zram *zram, u32 index,
}
}
-static void zram_reset_device(struct zram *zram, bool reset_capacity)
+static void zram_reset_device(struct zram *zram)
{
- size_t index;
struct zram_meta *meta;
+ struct zcomp *comp;
+ u64 disksize;
down_write(&zram->init_lock);
@@ -719,36 +745,30 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
}
meta = zram->meta;
- /* Free all pages that are still in this zram device */
- for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
- unsigned long handle = meta->table[index].handle;
- if (!handle)
- continue;
-
- zs_free(meta->mem_pool, handle);
- }
-
- zcomp_destroy(zram->comp);
- zram->max_comp_streams = 1;
+ comp = zram->comp;
+ disksize = zram->disksize;
+ /*
+ * Refcount will go down to 0 eventually and r/w handler
+ * cannot handle further I/O so it will bail out by
+ * check zram_meta_get.
+ */
+ zram_meta_put(zram);
+ /*
+ * We want to free zram_meta in process context to avoid
+ * deadlock between reclaim path and any other locks.
+ */
+ wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
- zram_meta_free(zram->meta);
- zram->meta = NULL;
/* Reset stats */
memset(&zram->stats, 0, sizeof(zram->stats));
-
zram->disksize = 0;
- if (reset_capacity)
- set_capacity(zram->disk, 0);
+ zram->max_comp_streams = 1;
+ set_capacity(zram->disk, 0);
up_write(&zram->init_lock);
-
- /*
- * Revalidate disk out of the init_lock to avoid lockdep splat.
- * It's okay because disk's capacity is protected by init_lock
- * so that revalidate_disk always sees up-to-date capacity.
- */
- if (reset_capacity)
- revalidate_disk(zram->disk);
+ /* I/O operation under all of CPU are done so let's free */
+ zram_meta_free(meta, disksize);
+ zcomp_destroy(comp);
}
static ssize_t disksize_store(struct device *dev,
@@ -765,7 +785,7 @@ static ssize_t disksize_store(struct device *dev,
return -EINVAL;
disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(disksize);
+ meta = zram_meta_alloc(zram->disk->first_minor, disksize);
if (!meta)
return -ENOMEM;
@@ -784,6 +804,8 @@ static ssize_t disksize_store(struct device *dev,
goto out_destroy_comp;
}
+ init_waitqueue_head(&zram->io_done);
+ atomic_set(&zram->refcount, 1);
zram->meta = meta;
zram->comp = comp;
zram->disksize = disksize;
@@ -803,7 +825,7 @@ out_destroy_comp:
up_write(&zram->init_lock);
zcomp_destroy(comp);
out_free_meta:
- zram_meta_free(meta);
+ zram_meta_free(meta, disksize);
return err;
}
@@ -821,8 +843,9 @@ static ssize_t reset_store(struct device *dev,
if (!bdev)
return -ENOMEM;
+ mutex_lock(&bdev->bd_mutex);
/* Do not reset an active device! */
- if (bdev->bd_holders) {
+ if (bdev->bd_openers) {
ret = -EBUSY;
goto out;
}
@@ -838,12 +861,16 @@ static ssize_t reset_store(struct device *dev,
/* Make sure all pending I/O is finished */
fsync_bdev(bdev);
+ zram_reset_device(zram);
+
+ mutex_unlock(&bdev->bd_mutex);
+ revalidate_disk(zram->disk);
bdput(bdev);
- zram_reset_device(zram, true);
return len;
out:
+ mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
return ret;
}
@@ -909,23 +936,21 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
{
struct zram *zram = queue->queuedata;
- down_read(&zram->init_lock);
- if (unlikely(!init_done(zram)))
+ if (unlikely(!zram_meta_get(zram)))
goto error;
if (!valid_io_request(zram, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size)) {
atomic64_inc(&zram->stats.invalid_io);
- goto error;
+ goto put_zram;
}
__zram_make_request(zram, bio);
- up_read(&zram->init_lock);
-
+ zram_meta_put(zram);
return;
-
+put_zram:
+ zram_meta_put(zram);
error:
- up_read(&zram->init_lock);
bio_io_error(bio);
}
@@ -947,21 +972,19 @@ static void zram_slot_free_notify(struct block_device *bdev,
static int zram_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, int rw)
{
- int offset, err;
+ int offset, err = -EIO;
u32 index;
struct zram *zram;
struct bio_vec bv;
zram = bdev->bd_disk->private_data;
+ if (unlikely(!zram_meta_get(zram)))
+ goto out;
+
if (!valid_io_request(zram, sector, PAGE_SIZE)) {
atomic64_inc(&zram->stats.invalid_io);
- return -EINVAL;
- }
-
- down_read(&zram->init_lock);
- if (unlikely(!init_done(zram))) {
- err = -EIO;
- goto out_unlock;
+ err = -EINVAL;
+ goto put_zram;
}
index = sector >> SECTORS_PER_PAGE_SHIFT;
@@ -972,8 +995,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
bv.bv_offset = 0;
err = zram_bvec_rw(zram, &bv, index, offset, rw);
-out_unlock:
- up_read(&zram->init_lock);
+put_zram:
+ zram_meta_put(zram);
+out:
/*
* If I/O fails, just return error(ie, non-zero) without
* calling page_endio.
@@ -1039,19 +1063,19 @@ static struct attribute_group zram_disk_attr_group = {
static int create_device(struct zram *zram, int device_id)
{
+ struct request_queue *queue;
int ret = -ENOMEM;
init_rwsem(&zram->init_lock);
- zram->queue = blk_alloc_queue(GFP_KERNEL);
- if (!zram->queue) {
+ queue = blk_alloc_queue(GFP_KERNEL);
+ if (!queue) {
pr_err("Error allocating disk queue for device %d\n",
device_id);
goto out;
}
- blk_queue_make_request(zram->queue, zram_make_request);
- zram->queue->queuedata = zram;
+ blk_queue_make_request(queue, zram_make_request);
/* gendisk structure */
zram->disk = alloc_disk(1);
@@ -1064,7 +1088,8 @@ static int create_device(struct zram *zram, int device_id)
zram->disk->major = zram_major;
zram->disk->first_minor = device_id;
zram->disk->fops = &zram_devops;
- zram->disk->queue = zram->queue;
+ zram->disk->queue = queue;
+ zram->disk->queue->queuedata = zram;
zram->disk->private_data = zram;
snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
@@ -1115,20 +1140,35 @@ out_free_disk:
del_gendisk(zram->disk);
put_disk(zram->disk);
out_free_queue:
- blk_cleanup_queue(zram->queue);
+ blk_cleanup_queue(queue);
out:
return ret;
}
-static void destroy_device(struct zram *zram)
+static void destroy_devices(unsigned int nr)
{
- sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
- &zram_disk_attr_group);
+ struct zram *zram;
+ unsigned int i;
- del_gendisk(zram->disk);
- put_disk(zram->disk);
+ for (i = 0; i < nr; i++) {
+ zram = &zram_devices[i];
+ /*
+ * Remove sysfs first, so no one will perform a disksize
+ * store while we destroy the devices
+ */
+ sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
+ &zram_disk_attr_group);
+
+ zram_reset_device(zram);
- blk_cleanup_queue(zram->queue);
+ blk_cleanup_queue(zram->disk->queue);
+ del_gendisk(zram->disk);
+ put_disk(zram->disk);
+ }
+
+ kfree(zram_devices);
+ unregister_blkdev(zram_major, "zram");
+ pr_info("Destroyed %u device(s)\n", nr);
}
static int __init zram_init(void)
@@ -1138,64 +1178,39 @@ static int __init zram_init(void)
if (num_devices > max_num_devices) {
pr_warn("Invalid value for num_devices: %u\n",
num_devices);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
pr_warn("Unable to get major number\n");
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
}
/* Allocate the device array and initialize each one */
zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
if (!zram_devices) {
- ret = -ENOMEM;
- goto unregister;
+ unregister_blkdev(zram_major, "zram");
+ return -ENOMEM;
}
for (dev_id = 0; dev_id < num_devices; dev_id++) {
ret = create_device(&zram_devices[dev_id], dev_id);
if (ret)
- goto free_devices;
+ goto out_error;
}
- pr_info("Created %u device(s) ...\n", num_devices);
-
+ pr_info("Created %u device(s)\n", num_devices);
return 0;
-free_devices:
- while (dev_id)
- destroy_device(&zram_devices[--dev_id]);
- kfree(zram_devices);
-unregister:
- unregister_blkdev(zram_major, "zram");
-out:
+out_error:
+ destroy_devices(dev_id);
return ret;
}
static void __exit zram_exit(void)
{
- int i;
- struct zram *zram;
-
- for (i = 0; i < num_devices; i++) {
- zram = &zram_devices[i];
-
- destroy_device(zram);
- /*
- * Shouldn't access zram->disk after destroy_device
- * because destroy_device already released zram->disk.
- */
- zram_reset_device(zram, false);
- }
-
- unregister_blkdev(zram_major, "zram");
-
- kfree(zram_devices);
- pr_debug("Cleanup done!\n");
+ destroy_devices(num_devices);
}
module_init(zram_init);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index b05a816b09ac..17056e589146 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -100,24 +100,25 @@ struct zram_meta {
struct zram {
struct zram_meta *meta;
- struct request_queue *queue;
- struct gendisk *disk;
struct zcomp *comp;
-
- /* Prevent concurrent execution of device init, reset and R/W request */
+ struct gendisk *disk;
+ /* Prevent concurrent execution of device init */
struct rw_semaphore init_lock;
/*
- * This is the limit on amount of *uncompressed* worth of data
- * we can store in a disk.
+ * the number of pages zram can consume for storing compressed data
*/
- u64 disksize; /* bytes */
+ unsigned long limit_pages;
int max_comp_streams;
+
struct zram_stats stats;
+ atomic_t refcount; /* refcount for zram_meta */
+ /* wait all IO under all of cpu are done */
+ wait_queue_head_t io_done;
/*
- * the number of pages zram can consume for storing compressed data
+ * This is the limit on amount of *uncompressed* worth of data
+ * we can store in a disk.
*/
- unsigned long limit_pages;
-
+ u64 disksize; /* bytes */
char compressor[10];
};
#endif
diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..d04be762b216 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -400,19 +400,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list)
* LRU lists entirely, while shrink_move moves it to the indicated
* private list.
*/
-static void d_lru_isolate(struct dentry *dentry)
+static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags &= ~DCACHE_LRU_LIST;
this_cpu_dec(nr_dentry_unused);
- list_del_init(&dentry->d_lru);
+ list_lru_isolate(lru, &dentry->d_lru);
}
-static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+ struct list_head *list)
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags |= DCACHE_SHRINK_LIST;
- list_move_tail(&dentry->d_lru, list);
+ list_lru_isolate_move(lru, &dentry->d_lru, list);
}
/*
@@ -869,8 +870,8 @@ static void shrink_dentry_list(struct list_head *list)
}
}
-static enum lru_status
-dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+static enum lru_status dentry_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -890,7 +891,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
* another pass through the LRU.
*/
if (dentry->d_lockref.count) {
- d_lru_isolate(dentry);
+ d_lru_isolate(lru, dentry);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
}
@@ -921,7 +922,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
return LRU_ROTATE;
}
- d_lru_shrink_move(dentry, freeable);
+ d_lru_shrink_move(lru, dentry, freeable);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
@@ -930,30 +931,28 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
/**
* prune_dcache_sb - shrink the dcache
* @sb: superblock
- * @nr_to_scan : number of entries to try to free
- * @nid: which node to scan for freeable entities
+ * @sc: shrink control, passed to list_lru_shrink_walk()
*
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
* function.
*
* This function may fail to free any resources if all the dentries are in
* use.
*/
-long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
- int nid)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
LIST_HEAD(dispose);
long freed;
- freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
- &dispose, &nr_to_scan);
+ freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+ dentry_lru_isolate, &dispose);
shrink_dentry_list(&dispose);
return freed;
}
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
- spinlock_t *lru_lock, void *arg)
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -966,7 +965,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
if (!spin_trylock(&dentry->d_lock))
return LRU_SKIP;
- d_lru_shrink_move(dentry, freeable);
+ d_lru_shrink_move(lru, dentry, freeable);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2bc2c87f35e7..5718cb9f7273 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
iput(toput_inode);
}
-static void drop_slab(void)
-{
- int nr_objects;
-
- do {
- int nid;
-
- nr_objects = 0;
- for_each_online_node(nid)
- nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
- 1000, 1000);
- } while (nr_objects > 10);
-}
-
int drop_caches_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e193cb36996..3aa17d4d1cfc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list)
}
-static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg)
+static enum lru_status gfs2_qd_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *dispose = arg;
struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
if (qd->qd_lockref.count == 0) {
lockref_mark_dead(&qd->qd_lockref);
- list_move(&qd->qd_lru, dispose);
+ list_lru_isolate_move(lru, &qd->qd_lru, dispose);
}
spin_unlock(&qd->qd_lockref.lock);
@@ -171,8 +172,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
- freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate,
- &dispose, &sc->nr_to_scan);
+ freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
+ gfs2_qd_isolate, &dispose);
gfs2_qd_dispose(&dispose);
@@ -182,7 +183,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid));
+ return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
}
struct shrinker gfs2_qd_shrinker = {
diff --git a/fs/inode.c b/fs/inode.c
index b7871577571d..86bfaca724db 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -672,8 +672,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
* LRU does not have strict ordering. Hence we don't want to reclaim inodes
* with this flag set because they are the inodes that are out of order.
*/
-static enum lru_status
-inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+static enum lru_status inode_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct inode *inode = container_of(item, struct inode, i_lru);
@@ -691,7 +691,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
*/
if (atomic_read(&inode->i_count) ||
(inode->i_state & ~I_REFERENCED)) {
- list_del_init(&inode->i_lru);
+ list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
@@ -725,7 +725,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
- list_move(&inode->i_lru, freeable);
+ list_lru_isolate_move(lru, &inode->i_lru, freeable);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
@@ -738,14 +738,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
* to trim from the LRU. Inodes to be freed are moved to a temporary list and
* then are freed outside inode_lock by dispose_list().
*/
-long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
- int nid)
+long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
LIST_HEAD(freeable);
long freed;
- freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate,
- &freeable, &nr_to_scan);
+ freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
+ inode_lru_isolate, &freeable);
dispose_list(&freeable);
return freed;
}
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..d92c346a793d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,6 +14,7 @@ struct file_system_type;
struct linux_binprm;
struct path;
struct mount;
+struct shrink_control;
/*
* block_dev.c
@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
* inode.c
*/
extern spinlock_t inode_sb_list_lock;
-extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
- int nid);
+extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
extern void inode_add_lru(struct inode *inode);
/*
@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
*/
extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
extern int d_set_mounted(struct dentry *dentry);
-extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
- int nid);
+extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
/*
* read_write.c
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bd117d065b82..a3ccf4c4ce70 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <asm/pgtable.h>
@@ -89,39 +90,18 @@
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
- int i;
- char *buf, *end;
- char *name;
+ char *buf;
char tcomm[sizeof(p->comm)];
get_task_comm(tcomm, p);
seq_puts(m, "Name:\t");
- end = m->buf + m->size;
buf = m->buf + m->count;
- name = tcomm;
- i = sizeof(tcomm);
- while (i && (buf < end)) {
- unsigned char c = *name;
- name++;
- i--;
- *buf = c;
- if (!c)
- break;
- if (c == '\\') {
- buf++;
- if (buf < end)
- *buf++ = c;
- continue;
- }
- if (c == '\n') {
- *buf++ = '\\';
- if (buf < end)
- *buf++ = 'n';
- continue;
- }
- buf++;
- }
+
+ /* Ignore error for now */
+ string_escape_str(tcomm, &buf, m->size - m->count,
+ ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+
m->count = buf - m->buf;
seq_putc(m, '\n');
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7fea13229f33..de14e46fd807 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -122,7 +122,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
- struct proc_dir_entry *de = PROC_I(inode)->pde;
+ struct proc_dir_entry *de = PDE(inode);
if (de && de->nlink)
set_nlink(inode, de->nlink);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8420a2f80811..13a50a32652d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
put_pid(PROC_I(inode)->pid);
/* Let go of any associated proc directory entry */
- de = PROC_I(inode)->pde;
+ de = PDE(inode);
if (de)
pde_put(de);
head = PROC_I(inode)->sysctl;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0e36c1e49fe3..956b75d61809 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -732,6 +732,7 @@ enum clear_refs_types {
CLEAR_REFS_ANON,
CLEAR_REFS_MAPPED,
CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_MM_HIWATER_RSS,
CLEAR_REFS_LAST,
};
@@ -907,6 +908,18 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
.mm = mm,
.private = &cp,
};
+
+ if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+ /*
+ * Writing 5 to /proc/pid/clear_refs resets the peak
+ * resident set size to this mm's current rss value.
+ */
+ down_write(&mm->mmap_sem);
+ reset_mm_hiwater_rss(mm);
+ up_write(&mm->mmap_sem);
+ goto out_mm;
+ }
+
down_read(&mm->mmap_sem);
if (type == CLEAR_REFS_SOFT_DIRTY) {
for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -928,6 +941,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
+out_mm:
mmput(mm);
}
put_task_struct(task);
@@ -1543,6 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
for_each_node_state(nid, N_MEMORY)
if (md->node[nid])
seq_printf(m, " N%d=%lu", nid, md->node[nid]);
+
+ seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
seq_putc(m, '\n');
m_cache_vma(m, vma);
diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..f684c750e08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
if (ret == -EINTR) {
struct restart_block *restart_block;
- restart_block = &current_thread_info()->restart_block;
+ restart_block = &current->restart_block;
restart_block->fn = do_restart_poll;
restart_block->poll.ufds = ufds;
restart_block->poll.nfds = nfds;
diff --git a/fs/super.c b/fs/super.c
index 05a021638b11..1facd2c282e5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -75,10 +75,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
return SHRINK_STOP;
if (sb->s_op->nr_cached_objects)
- fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
+ fs_objects = sb->s_op->nr_cached_objects(sb, sc);
- inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
- dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
+ inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+ dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
total_objects = dentries + inodes + fs_objects + 1;
if (!total_objects)
total_objects = 1;
@@ -86,19 +86,23 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
/* proportion the scan between the caches */
dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+ fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
/*
* prune the dcache first as the icache is pinned by it, then
* prune the icache, followed by the filesystem specific caches
+ *
+ * Ensure that we always scan at least one object - memcg kmem
+ * accounting uses this to fully empty the caches.
*/
- freed = prune_dcache_sb(sb, dentries, sc->nid);
- freed += prune_icache_sb(sb, inodes, sc->nid);
+ sc->nr_to_scan = dentries + 1;
+ freed = prune_dcache_sb(sb, sc);
+ sc->nr_to_scan = inodes + 1;
+ freed += prune_icache_sb(sb, sc);
if (fs_objects) {
- fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
- total_objects);
- freed += sb->s_op->free_cached_objects(sb, fs_objects,
- sc->nid);
+ sc->nr_to_scan = fs_objects + 1;
+ freed += sb->s_op->free_cached_objects(sb, sc);
}
drop_super(sb);
@@ -118,17 +122,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
* scalability bottleneck. The counts could get updated
* between super_cache_count and super_cache_scan anyway.
* Call to super_cache_count with shrinker_rwsem held
- * ensures the safety of call to list_lru_count_node() and
+ * ensures the safety of call to list_lru_shrink_count() and
* s_op->nr_cached_objects().
*/
if (sb->s_op && sb->s_op->nr_cached_objects)
- total_objects = sb->s_op->nr_cached_objects(sb,
- sc->nid);
+ total_objects = sb->s_op->nr_cached_objects(sb, sc);
- total_objects += list_lru_count_node(&sb->s_dentry_lru,
- sc->nid);
- total_objects += list_lru_count_node(&sb->s_inode_lru,
- sc->nid);
+ total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+ total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
total_objects = vfs_pressure_ratio(total_objects);
return total_objects;
@@ -191,9 +192,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
- if (list_lru_init(&s->s_dentry_lru))
+ if (list_lru_init_memcg(&s->s_dentry_lru))
goto fail;
- if (list_lru_init(&s->s_inode_lru))
+ if (list_lru_init_memcg(&s->s_inode_lru))
goto fail;
init_rwsem(&s->s_umount);
@@ -229,7 +230,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
s->s_shrink.scan_objects = super_cache_scan;
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
- s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+ s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
return s;
fail:
@@ -284,6 +285,14 @@ void deactivate_locked_super(struct super_block *s)
unregister_shrinker(&s->s_shrink);
fs->kill_sb(s);
+ /*
+ * Since list_lru_destroy() may sleep, we cannot call it from
+ * put_super(), where we hold the sb_lock. Therefore we destroy
+ * the lru lists right now.
+ */
+ list_lru_destroy(&s->s_dentry_lru);
+ list_lru_destroy(&s->s_inode_lru);
+
put_filesystem(fs);
put_super(s);
} else {
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bb502a391792..1790b00bea7a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1488,6 +1488,7 @@ xfs_buf_iomove(
static enum lru_status
xfs_buftarg_wait_rele(
struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
@@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele(
*/
atomic_set(&bp->b_lru_ref, 0);
bp->b_state |= XFS_BSTATE_DISPOSE;
- list_move(item, dispose);
+ list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
}
@@ -1546,6 +1547,7 @@ xfs_wait_buftarg(
static enum lru_status
xfs_buftarg_isolate(
struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
{
@@ -1569,7 +1571,7 @@ xfs_buftarg_isolate(
}
bp->b_state |= XFS_BSTATE_DISPOSE;
- list_move(item, dispose);
+ list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
}
@@ -1583,10 +1585,9 @@ xfs_buftarg_shrink_scan(
struct xfs_buftarg, bt_shrinker);
LIST_HEAD(dispose);
unsigned long freed;
- unsigned long nr_to_scan = sc->nr_to_scan;
- freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
- &dispose, &nr_to_scan);
+ freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+ xfs_buftarg_isolate, &dispose);
while (!list_empty(&dispose)) {
struct xfs_buf *bp;
@@ -1605,7 +1606,7 @@ xfs_buftarg_shrink_count(
{
struct xfs_buftarg *btp = container_of(shrink,
struct xfs_buftarg, bt_shrinker);
- return list_lru_count_node(&btp->bt_lru, sc->nid);
+ return list_lru_shrink_count(&btp->bt_lru, sc);
}
void
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3e8186279541..53cc2aaf8d2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -430,6 +430,7 @@ struct xfs_qm_isolate {
static enum lru_status
xfs_qm_dquot_isolate(
struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
__releases(lru_lock) __acquires(lru_lock)
@@ -450,7 +451,7 @@ xfs_qm_dquot_isolate(
XFS_STATS_INC(xs_qm_dqwants);
trace_xfs_dqreclaim_want(dqp);
- list_del_init(&dqp->q_lru);
+ list_lru_isolate(lru, &dqp->q_lru);
XFS_STATS_DEC(xs_qm_dquot_unused);
return LRU_REMOVED;
}
@@ -494,7 +495,7 @@ xfs_qm_dquot_isolate(
xfs_dqunlock(dqp);
ASSERT(dqp->q_nrefs == 0);
- list_move_tail(&dqp->q_lru, &isol->dispose);
+ list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
XFS_STATS_DEC(xs_qm_dquot_unused);
trace_xfs_dqreclaim_done(dqp);
XFS_STATS_INC(xs_qm_dqreclaims);
@@ -523,7 +524,6 @@ xfs_qm_shrink_scan(
struct xfs_qm_isolate isol;
unsigned long freed;
int error;
- unsigned long nr_to_scan = sc->nr_to_scan;
if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
return 0;
@@ -531,8 +531,8 @@ xfs_qm_shrink_scan(
INIT_LIST_HEAD(&isol.buffers);
INIT_LIST_HEAD(&isol.dispose);
- freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
- &nr_to_scan);
+ freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+ xfs_qm_dquot_isolate, &isol);
error = xfs_buf_delwri_submit(&isol.buffers);
if (error)
@@ -557,7 +557,7 @@ xfs_qm_shrink_count(
struct xfs_quotainfo *qi = container_of(shrink,
struct xfs_quotainfo, qi_shrinker);
- return list_lru_count_node(&qi->qi_lru, sc->nid);
+ return list_lru_shrink_count(&qi->qi_lru, sc);
}
/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f2449fd86926..8fcc4ccc5c79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1537,7 +1537,7 @@ xfs_fs_mount(
static long
xfs_fs_nr_cached_objects(
struct super_block *sb,
- int nid)
+ struct shrink_control *sc)
{
return xfs_reclaim_inodes_count(XFS_M(sb));
}
@@ -1545,10 +1545,9 @@ xfs_fs_nr_cached_objects(
static long
xfs_fs_free_cached_objects(
struct super_block *sb,
- long nr_to_scan,
- int nid)
+ struct shrink_control *sc)
{
- return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+ return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
}
static const struct super_operations xfs_super_operations = {
diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h
index 9318a87ee39a..a8f344363e77 100644
--- a/include/acpi/acoutput.h
+++ b/include/acpi/acoutput.h
@@ -240,7 +240,7 @@
/*
* If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header,
* define it now. This is the case where there the compiler does not support
- * a __FUNCTION__ macro or equivalent.
+ * a __func__ macro or equivalent.
*/
#ifndef ACPI_GET_FUNCTION_NAME
#define ACPI_GET_FUNCTION_NAME _acpi_function_name
@@ -249,12 +249,12 @@
* The Name parameter should be the procedure name as a quoted string.
* The function name is also used by the function exit macros below.
* Note: (const char) is used to be compatible with the debug interfaces
- * and macros such as __FUNCTION__.
+ * and macros such as __func__.
*/
#define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name;
#else
-/* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */
+/* Compiler supports __func__ (or equivalent) -- Ignore this macro */
#define ACPI_FUNCTION_NAME(name)
#endif /* ACPI_GET_FUNCTION_NAME */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 129de9204d18..4d46085c1b90 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -244,10 +244,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
# define pte_accessible(mm, pte) ((void)(pte), 1)
#endif
-#ifndef pte_present_nonuma
-#define pte_present_nonuma(pte) pte_present(pte)
-#endif
-
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif
@@ -673,155 +669,24 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
#endif
}
-#ifdef CONFIG_NUMA_BALANCING
-/*
- * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that
- * is protected for PROT_NONE and a NUMA hinting fault entry. If the
- * architecture defines __PAGE_PROTNONE then it should take that into account
- * but those that do not can rely on the fact that the NUMA hinting scanner
- * skips inaccessible VMAs.
- *
- * pte/pmd_present() returns true if pte/pmd_numa returns true. Page
- * fault triggers on those regions if pte/pmd_numa returns true
- * (because _PAGE_PRESENT is not set).
- */
-#ifndef pte_numa
-static inline int pte_numa(pte_t pte)
-{
- return ptenuma_flags(pte) == _PAGE_NUMA;
-}
-#endif
-
-#ifndef pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
- return pmdnuma_flags(pmd) == _PAGE_NUMA;
-}
-#endif
-
+#ifndef CONFIG_NUMA_BALANCING
/*
- * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
- * because they're called by the NUMA hinting minor page fault. If we
- * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler
- * would be forced to set it later while filling the TLB after we
- * return to userland. That would trigger a second write to memory
- * that we optimize away by setting _PAGE_ACCESSED here.
+ * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
+ * the only case the kernel cares is for NUMA balancing and is only ever set
+ * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked
+ * _PAGE_PROTNONE so by by default, implement the helper as "always no". It
+ * is the responsibility of the caller to distinguish between PROT_NONE
+ * protections and NUMA hinting fault protections.
*/
-#ifndef pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
- pteval_t val = pte_val(pte);
-
- val &= ~_PAGE_NUMA;
- val |= (_PAGE_PRESENT|_PAGE_ACCESSED);
- return __pte(val);
-}
-#endif
-
-#ifndef pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
- pmdval_t val = pmd_val(pmd);
-
- val &= ~_PAGE_NUMA;
- val |= (_PAGE_PRESENT|_PAGE_ACCESSED);
-
- return __pmd(val);
-}
-#endif
-
-#ifndef pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
- pteval_t val = pte_val(pte);
-
- VM_BUG_ON(!(val & _PAGE_PRESENT));
-
- val &= ~_PAGE_PRESENT;
- val |= _PAGE_NUMA;
-
- return __pte(val);
-}
-#endif
-
-#ifndef ptep_set_numa
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- pte_t ptent = *ptep;
-
- ptent = pte_mknuma(ptent);
- set_pte_at(mm, addr, ptep, ptent);
- return;
-}
-#endif
-
-#ifndef pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
- pmdval_t val = pmd_val(pmd);
-
- val &= ~_PAGE_PRESENT;
- val |= _PAGE_NUMA;
-
- return __pmd(val);
-}
-#endif
-
-#ifndef pmdp_set_numa
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- pmd_t pmd = *pmdp;
-
- pmd = pmd_mknuma(pmd);
- set_pmd_at(mm, addr, pmdp, pmd);
- return;
-}
-#endif
-#else
-static inline int pmd_numa(pmd_t pmd)
+static inline int pte_protnone(pte_t pte)
{
return 0;
}
-static inline int pte_numa(pte_t pte)
+static inline int pmd_protnone(pmd_t pmd)
{
return 0;
}
-
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
- return pte;
-}
-
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
- return pmd;
-}
-
-static inline pte_t pte_mknuma(pte_t pte)
-{
- return pte;
-}
-
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- return;
-}
-
-
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
- return pmd;
-}
-
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- return ;
-}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_MMU */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 202e4034fe26..5f5c00de39f0 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -160,18 +160,18 @@ extern int bitmap_parselist(const char *buf, unsigned long *maskp,
extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
- const unsigned long *old, const unsigned long *new, int bits);
+ const unsigned long *old, const unsigned long *new, unsigned int nbits);
extern int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
- const unsigned long *relmap, int bits);
+ const unsigned long *relmap, unsigned int bits);
extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
- int sz, int bits);
+ unsigned int sz, unsigned int nbits);
extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
-extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
+extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
extern int bitmap_print_to_pagebuf(bool list, char *buf,
const unsigned long *maskp, int nmaskbits);
@@ -185,33 +185,33 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf,
#define small_const_nbits(nbits) \
(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
-static inline void bitmap_zero(unsigned long *dst, int nbits)
+static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = 0UL;
else {
- int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+ unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memset(dst, 0, len);
}
}
-static inline void bitmap_fill(unsigned long *dst, int nbits)
+static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
{
- size_t nlongs = BITS_TO_LONGS(nbits);
+ unsigned int nlongs = BITS_TO_LONGS(nbits);
if (!small_const_nbits(nbits)) {
- int len = (nlongs - 1) * sizeof(unsigned long);
+ unsigned int len = (nlongs - 1) * sizeof(unsigned long);
memset(dst, 0xff, len);
}
dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
}
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
- int nbits)
+ unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = *src;
else {
- int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+ unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memcpy(dst, src, len);
}
}
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b950e9d6008b..ff9044286d88 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -905,13 +905,13 @@ static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp)
}
#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS)
-static inline void __cpus_setall(cpumask_t *dstp, int nbits)
+static inline void __cpus_setall(cpumask_t *dstp, unsigned int nbits)
{
bitmap_fill(dstp->bits, nbits);
}
#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS)
-static inline void __cpus_clear(cpumask_t *dstp, int nbits)
+static inline void __cpus_clear(cpumask_t *dstp, unsigned int nbits)
{
bitmap_zero(dstp->bits, nbits);
}
@@ -927,21 +927,21 @@ static inline int __cpu_test_and_set(int cpu, cpumask_t *addr)
#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS)
static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p,
- const cpumask_t *src2p, int nbits)
+ const cpumask_t *src2p, unsigned int nbits)
{
return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
}
#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS)
static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p,
- const cpumask_t *src2p, int nbits)
+ const cpumask_t *src2p, unsigned int nbits)
{
bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
}
#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS)
static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
- const cpumask_t *src2p, int nbits)
+ const cpumask_t *src2p, unsigned int nbits)
{
bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
}
@@ -949,40 +949,40 @@ static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
#define cpus_andnot(dst, src1, src2) \
__cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS)
static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
- const cpumask_t *src2p, int nbits)
+ const cpumask_t *src2p, unsigned int nbits)
{
return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
}
#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
static inline int __cpus_equal(const cpumask_t *src1p,
- const cpumask_t *src2p, int nbits)
+ const cpumask_t *src2p, unsigned int nbits)
{
return bitmap_equal(src1p->bits, src2p->bits, nbits);
}
#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS)
static inline int __cpus_intersects(const cpumask_t *src1p,
- const cpumask_t *src2p, int nbits)
+ const cpumask_t *src2p, unsigned int nbits)
{
return bitmap_intersects(src1p->bits, src2p->bits, nbits);
}
#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS)
static inline int __cpus_subset(const cpumask_t *src1p,
- const cpumask_t *src2p, int nbits)
+ const cpumask_t *src2p, unsigned int nbits)
{
return bitmap_subset(src1p->bits, src2p->bits, nbits);
}
#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS)
-static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
+static inline int __cpus_empty(const cpumask_t *srcp, unsigned int nbits)
{
return bitmap_empty(srcp->bits, nbits);
}
#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
-static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
+static inline int __cpus_weight(const cpumask_t *srcp, unsigned int nbits)
{
return bitmap_weight(srcp->bits, nbits);
}
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index 2cd9f1cf9fa3..f4754282c9c2 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -1,6 +1,8 @@
#ifndef __CRYPTOHASH_H
#define __CRYPTOHASH_H
+#include <uapi/linux/types.h>
+
#define SHA_DIGEST_WORDS 5
#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
#define SHA_WORKSPACE_WORDS 16
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec0f1dc66b9b..e49f10cc8a73 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1655,8 +1655,10 @@ struct super_operations {
struct dquot **(*get_dquots)(struct inode *);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
- long (*nr_cached_objects)(struct super_block *, int);
- long (*free_cached_objects)(struct super_block *, long, int);
+ long (*nr_cached_objects)(struct super_block *,
+ struct shrink_control *);
+ long (*free_cached_objects)(struct super_block *,
+ struct shrink_control *);
};
/*
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3037fc085e8e..d3d43ecf148c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -193,6 +193,9 @@ extern struct task_group root_task_group;
.nr_cpus_allowed= NR_CPUS, \
.mm = NULL, \
.active_mm = &init_mm, \
+ .restart_block = { \
+ .fn = do_no_restart_syscall, \
+ }, \
.se = { \
.group_node = LIST_HEAD_INIT(tsk.se.group_node), \
}, \
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e42e7dc34c68..d6d630d31ef3 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -800,9 +800,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
-/* Trap pasters of __FUNCTION__ at compile-time */
-#define __FUNCTION__ (__func__)
-
/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index f3434533fbf8..2a6b9947aaa3 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -9,6 +9,9 @@
#include <linux/list.h>
#include <linux/nodemask.h>
+#include <linux/shrinker.h>
+
+struct mem_cgroup;
/* list_lru_walk_cb has to always return one of those */
enum lru_status {
@@ -21,24 +24,45 @@ enum lru_status {
internally, but has to return locked. */
};
-struct list_lru_node {
- spinlock_t lock;
+struct list_lru_one {
struct list_head list;
- /* kept as signed so we can catch imbalance bugs */
+ /* may become negative during memcg reparenting */
long nr_items;
+};
+
+struct list_lru_memcg {
+ /* array of per cgroup lists, indexed by memcg_cache_id */
+ struct list_lru_one *lru[0];
+};
+
+struct list_lru_node {
+ /* protects all lists on the node, including per cgroup */
+ spinlock_t lock;
+ /* global list, used for the root cgroup in cgroup aware lrus */
+ struct list_lru_one lru;
+#ifdef CONFIG_MEMCG_KMEM
+ /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
+ struct list_lru_memcg *memcg_lrus;
+#endif
} ____cacheline_aligned_in_smp;
struct list_lru {
struct list_lru_node *node;
- nodemask_t active_nodes;
+#ifdef CONFIG_MEMCG_KMEM
+ struct list_head list;
+#endif
};
void list_lru_destroy(struct list_lru *lru);
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
-static inline int list_lru_init(struct list_lru *lru)
-{
- return list_lru_init_key(lru, NULL);
-}
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+ struct lock_class_key *key);
+
+#define list_lru_init(lru) __list_lru_init((lru), false, NULL)
+#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key))
+#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL)
+
+int memcg_update_all_list_lrus(int num_memcgs);
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx);
/**
* list_lru_add: add an element to the lru list's tail
@@ -72,32 +96,48 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
bool list_lru_del(struct list_lru *lru, struct list_head *item);
/**
- * list_lru_count_node: return the number of objects currently held by @lru
+ * list_lru_count_one: return the number of objects currently held by @lru
* @lru: the lru pointer.
* @nid: the node id to count from.
+ * @memcg: the cgroup to count from.
*
* Always return a non-negative number, 0 for empty lists. There is no
* guarantee that the list is not updated while the count is being computed.
* Callers that want such a guarantee need to provide an outer lock.
*/
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg);
unsigned long list_lru_count_node(struct list_lru *lru, int nid);
+
+static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
+ struct shrink_control *sc)
+{
+ return list_lru_count_one(lru, sc->nid, sc->memcg);
+}
+
static inline unsigned long list_lru_count(struct list_lru *lru)
{
long count = 0;
int nid;
- for_each_node_mask(nid, lru->active_nodes)
+ for_each_node_state(nid, N_NORMAL_MEMORY)
count += list_lru_count_node(lru, nid);
return count;
}
-typedef enum lru_status
-(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+ struct list_head *head);
+
+typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
+ struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
+
/**
- * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
* @lru: the lru pointer.
* @nid: the node id to scan from.
+ * @memcg: the cgroup to scan from.
* @isolate: callback function that is resposible for deciding what to do with
* the item currently being scanned
* @cb_arg: opaque type that will be passed to @isolate
@@ -115,18 +155,30 @@ typedef enum lru_status
*
* Return value: the number of objects effectively removed from the LRU.
*/
+unsigned long list_lru_walk_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk);
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
static inline unsigned long
+list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+ list_lru_walk_cb isolate, void *cb_arg)
+{
+ return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg,
+ &sc->nr_to_scan);
+}
+
+static inline unsigned long
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
void *cb_arg, unsigned long nr_to_walk)
{
long isolated = 0;
int nid;
- for_each_node_mask(nid, lru->active_nodes) {
+ for_each_node_state(nid, N_NORMAL_MEMORY) {
isolated += list_lru_walk_node(lru, nid, isolate,
cb_arg, &nr_to_walk);
if (nr_to_walk <= 0)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6cfd934c7c9b..72dff5fb0d0c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -398,7 +398,9 @@ static inline void sock_release_memcg(struct sock *sk)
#ifdef CONFIG_MEMCG_KMEM
extern struct static_key memcg_kmem_enabled_key;
-extern int memcg_limited_groups_array_size;
+extern int memcg_nr_cache_ids;
+extern void memcg_get_cache_ids(void);
+extern void memcg_put_cache_ids(void);
/*
* Helper macro to loop through all memcg-specific caches. Callers must still
@@ -406,13 +408,15 @@ extern int memcg_limited_groups_array_size;
* the slab_mutex must be held when looping through those caches
*/
#define for_each_memcg_cache_index(_idx) \
- for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++)
+ for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)
static inline bool memcg_kmem_enabled(void)
{
return static_key_false(&memcg_kmem_enabled_key);
}
+bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+
/*
* In general, we'll do everything in our power to not incur in any overhead
* for non-memcg users for the kmem functions. Not even a function call, if we
@@ -432,11 +436,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order);
int memcg_cache_id(struct mem_cgroup *memcg);
-void memcg_update_array_size(int num_groups);
-
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
+
int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
unsigned long nr_pages);
void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
@@ -533,6 +537,13 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
if (memcg_kmem_enabled())
__memcg_kmem_put_cache(cachep);
}
+
+static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+ if (!memcg_kmem_enabled())
+ return NULL;
+ return __mem_cgroup_from_kmem(ptr);
+}
#else
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
@@ -542,6 +553,11 @@ static inline bool memcg_kmem_enabled(void)
return false;
}
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+ return false;
+}
+
static inline bool
memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
{
@@ -562,6 +578,14 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return -1;
}
+static inline void memcg_get_cache_ids(void)
+{
+}
+
+static inline void memcg_put_cache_ids(void)
+{
+}
+
static inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
@@ -571,6 +595,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
}
+
+static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+ return NULL;
+}
#endif /* CONFIG_MEMCG_KMEM */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index fab9b32ace8e..78baed5f2952 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -67,7 +67,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
#ifdef CONFIG_NUMA_BALANCING
extern bool pmd_trans_migrating(pmd_t pmd);
-extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd);
extern int migrate_misplaced_page(struct page *page,
struct vm_area_struct *vma, int node);
extern bool migrate_ratelimited(int node);
@@ -76,9 +75,6 @@ static inline bool pmd_trans_migrating(pmd_t pmd)
{
return false;
}
-static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
-{
-}
static inline int migrate_misplaced_page(struct page *page,
struct vm_area_struct *vma, int node)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a4d24f3c5430..9bee7ec0c31f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1408,6 +1408,11 @@ static inline void update_hiwater_vm(struct mm_struct *mm)
mm->hiwater_vm = mm->total_vm;
}
+static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
+{
+ mm->hiwater_rss = get_mm_rss(mm);
+}
+
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
struct mm_struct *mm)
{
@@ -1447,13 +1452,15 @@ static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif
-#ifdef __PAGETABLE_PMD_FOLDED
+#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
unsigned long address)
{
return 0;
}
+static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
+
static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
{
return 0;
@@ -1465,6 +1472,11 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+static inline void mm_nr_pmds_init(struct mm_struct *mm)
+{
+ atomic_long_set(&mm->nr_pmds, 0);
+}
+
static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
{
return atomic_long_read(&mm->nr_pmds);
@@ -2168,9 +2180,8 @@ int drop_caches_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
#endif
-unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
- unsigned long nr_scanned,
- unsigned long nr_eligible);
+void drop_slab(void);
+void drop_slab_node(int nid);
#ifndef CONFIG_MMU
#define randomize_va_space 0
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 83a6aeda899d..21cef483dc1b 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -120,13 +120,13 @@ static inline void __node_clear(int node, volatile nodemask_t *dstp)
}
#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
-static inline void __nodes_setall(nodemask_t *dstp, int nbits)
+static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
{
bitmap_fill(dstp->bits, nbits);
}
#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
-static inline void __nodes_clear(nodemask_t *dstp, int nbits)
+static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
{
bitmap_zero(dstp->bits, nbits);
}
@@ -144,7 +144,7 @@ static inline int __node_test_and_set(int node, nodemask_t *addr)
#define nodes_and(dst, src1, src2) \
__nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
- const nodemask_t *src2p, int nbits)
+ const nodemask_t *src2p, unsigned int nbits)
{
bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
}
@@ -152,7 +152,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
#define nodes_or(dst, src1, src2) \
__nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
- const nodemask_t *src2p, int nbits)
+ const nodemask_t *src2p, unsigned int nbits)
{
bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
}
@@ -160,7 +160,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
#define nodes_xor(dst, src1, src2) \
__nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
- const nodemask_t *src2p, int nbits)
+ const nodemask_t *src2p, unsigned int nbits)
{
bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
}
@@ -168,7 +168,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
#define nodes_andnot(dst, src1, src2) \
__nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
- const nodemask_t *src2p, int nbits)
+ const nodemask_t *src2p, unsigned int nbits)
{
bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
}
@@ -176,7 +176,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
#define nodes_complement(dst, src) \
__nodes_complement(&(dst), &(src), MAX_NUMNODES)
static inline void __nodes_complement(nodemask_t *dstp,
- const nodemask_t *srcp, int nbits)
+ const nodemask_t *srcp, unsigned int nbits)
{
bitmap_complement(dstp->bits, srcp->bits, nbits);
}
@@ -184,7 +184,7 @@ static inline void __nodes_complement(nodemask_t *dstp,
#define nodes_equal(src1, src2) \
__nodes_equal(&(src1), &(src2), MAX_NUMNODES)
static inline int __nodes_equal(const nodemask_t *src1p,
- const nodemask_t *src2p, int nbits)
+ const nodemask_t *src2p, unsigned int nbits)
{
return bitmap_equal(src1p->bits, src2p->bits, nbits);
}
@@ -192,7 +192,7 @@ static inline int __nodes_equal(const nodemask_t *src1p,
#define nodes_intersects(src1, src2) \
__nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
static inline int __nodes_intersects(const nodemask_t *src1p,
- const nodemask_t *src2p, int nbits)
+ const nodemask_t *src2p, unsigned int nbits)
{
return bitmap_intersects(src1p->bits, src2p->bits, nbits);
}
@@ -200,25 +200,25 @@ static inline int __nodes_intersects(const nodemask_t *src1p,
#define nodes_subset(src1, src2) \
__nodes_subset(&(src1), &(src2), MAX_NUMNODES)
static inline int __nodes_subset(const nodemask_t *src1p,
- const nodemask_t *src2p, int nbits)
+ const nodemask_t *src2p, unsigned int nbits)
{
return bitmap_subset(src1p->bits, src2p->bits, nbits);
}
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
-static inline int __nodes_empty(const nodemask_t *srcp, int nbits)
+static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
return bitmap_empty(srcp->bits, nbits);
}
#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
-static inline int __nodes_full(const nodemask_t *srcp, int nbits)
+static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits)
{
return bitmap_full(srcp->bits, nbits);
}
#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
-static inline int __nodes_weight(const nodemask_t *srcp, int nbits)
+static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
{
return bitmap_weight(srcp->bits, nbits);
}
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 4d5bf5726578..baa3f97d8ce8 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -417,9 +417,9 @@ enum {
DUMP_PREFIX_ADDRESS,
DUMP_PREFIX_OFFSET
};
-extern void hex_dump_to_buffer(const void *buf, size_t len,
- int rowsize, int groupsize,
- char *linebuf, size_t linebuflen, bool ascii);
+extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
+ int groupsize, char *linebuf, size_t linebuflen,
+ bool ascii);
#ifdef CONFIG_PRINTK
extern void print_hex_dump(const char *level, const char *prefix_str,
int prefix_type, int rowsize, int groupsize,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef98d2f..048b91b983ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1370,6 +1370,8 @@ struct task_struct {
unsigned long atomic_flags; /* Flags needing atomic access. */
+ struct restart_block restart_block;
+
pid_t pid;
pid_t tgid;
@@ -2145,6 +2147,7 @@ extern unsigned long long notrace sched_clock(void);
*/
extern u64 cpu_clock(int cpu);
extern u64 local_clock(void);
+extern u64 running_clock(void);
extern u64 sched_clock_cpu(int cpu);
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index f4aee75f00b1..4fcacd915d45 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -20,6 +20,9 @@ struct shrink_control {
/* current node being shrunk (for NUMA aware shrinkers) */
int nid;
+
+ /* current memcg being shrunk (for memcg aware shrinkers) */
+ struct mem_cgroup *memcg;
};
#define SHRINK_STOP (~0UL)
@@ -61,7 +64,8 @@ struct shrinker {
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
/* Flags */
-#define SHRINKER_NUMA_AWARE (1 << 0)
+#define SHRINKER_NUMA_AWARE (1 << 0)
+#define SHRINKER_MEMCG_AWARE (1 << 1)
extern int register_shrinker(struct shrinker *);
extern void unregister_shrinker(struct shrinker *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2e3b448cfa2d..ed2ffaab59ea 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -115,13 +115,12 @@ int slab_is_available(void);
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
unsigned long,
void (*)(void *));
-#ifdef CONFIG_MEMCG_KMEM
-void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
-void memcg_destroy_kmem_caches(struct mem_cgroup *);
-#endif
void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);
-void kmem_cache_free(struct kmem_cache *, void *);
+
+void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
+void memcg_deactivate_kmem_caches(struct mem_cgroup *);
+void memcg_destroy_kmem_caches(struct mem_cgroup *);
/*
* Please use this macro to create slab caches. Simply specify the
@@ -288,6 +287,7 @@ static __always_inline int kmalloc_index(size_t size)
void *__kmalloc(size_t size, gfp_t flags);
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
+void kmem_cache_free(struct kmem_cache *, void *);
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node);
@@ -473,14 +473,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif
+
+struct memcg_cache_array {
+ struct rcu_head rcu;
+ struct kmem_cache *entries[0];
+};
+
/*
* This is the main placeholder for memcg-related information in kmem caches.
- * struct kmem_cache will hold a pointer to it, so the memory cost while
- * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it
- * would otherwise be if that would be bundled in kmem_cache: we'll need an
- * extra pointer chase. But the trade off clearly lays in favor of not
- * penalizing non-users.
- *
* Both the root cache and the child caches will have it. For the root cache,
* this will hold a dynamically allocated array large enough to hold
* information about the currently limited memcgs in the system. To allow the
@@ -491,14 +491,15 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
*
* @memcg: pointer to the memcg this cache belongs to
* @root_cache: pointer to the global, root cache, this cache was derived from
+ *
+ * Both root and child caches of the same kind are linked into a list chained
+ * through @list.
*/
struct memcg_cache_params {
bool is_root_cache;
+ struct list_head list;
union {
- struct {
- struct rcu_head rcu_head;
- struct kmem_cache *memcg_caches[0];
- };
+ struct memcg_cache_array __rcu *memcg_caches;
struct {
struct mem_cgroup *memcg;
struct kmem_cache *root_cache;
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index b869d1662ba3..33d049066c3d 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -70,7 +70,7 @@ struct kmem_cache {
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
- struct memcg_cache_params *memcg_params;
+ struct memcg_cache_params memcg_params;
#endif
struct kmem_cache_node *node[MAX_NUMNODES];
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index d82abd40a3c0..9abf04ed0999 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -85,7 +85,7 @@ struct kmem_cache {
struct kobject kobj; /* For sysfs */
#endif
#ifdef CONFIG_MEMCG_KMEM
- struct memcg_cache_params *memcg_params;
+ struct memcg_cache_params memcg_params;
int max_attr_size; /* for propagation, maximum size of a stored attr */
#ifdef CONFIG_SYSFS
struct kset *memcg_kset;
diff --git a/include/linux/string.h b/include/linux/string.h
index 2e22a2e58f3a..b9bc9a5d9e21 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -40,9 +40,6 @@ extern int strcmp(const char *,const char *);
#ifndef __HAVE_ARCH_STRNCMP
extern int strncmp(const char *,const char *,__kernel_size_t);
#endif
-#ifndef __HAVE_ARCH_STRNICMP
-#define strnicmp strncasecmp
-#endif
#ifndef __HAVE_ARCH_STRCASECMP
extern int strcasecmp(const char *s1, const char *s2);
#endif
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 6eb567ac56bc..657571817260 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -10,8 +10,8 @@ enum string_size_units {
STRING_UNITS_2, /* use binary powers of 2^10 */
};
-int string_get_size(u64 size, enum string_size_units units,
- char *buf, int len);
+void string_get_size(u64 size, enum string_size_units units,
+ char *buf, int len);
#define UNESCAPE_SPACE 0x01
#define UNESCAPE_OCTAL 0x02
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 831a3168ab35..cedf3d3c373f 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
/* check whether a pte points to a swap entry */
static inline int is_swap_pte(pte_t pte)
{
- return !pte_none(pte) && !pte_present_nonuma(pte);
+ return !pte_none(pte) && !pte_present(pte);
}
#endif
diff --git a/include/linux/types.h b/include/linux/types.h
index 62323825cff9..6747247e3f9f 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -135,12 +135,9 @@ typedef unsigned long blkcnt_t;
#endif
/*
- * The type of an index into the pagecache. Use a #define so asm/types.h
- * can override it.
+ * The type of an index into the pagecache.
*/
-#ifndef pgoff_t
#define pgoff_t unsigned long
-#endif
/* A dma_addr_t can hold any valid DMA or bus address for the platform */
#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index f14bd75f08b3..56529b34dc63 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -36,7 +36,8 @@ enum zpool_mapmode {
ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
};
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops);
+struct zpool *zpool_create_pool(char *type, char *name,
+ gfp_t gfp, struct zpool_ops *ops);
char *zpool_get_type(struct zpool *pool);
@@ -80,7 +81,7 @@ struct zpool_driver {
atomic_t refcount;
struct list_head list;
- void *(*create)(gfp_t gfp, struct zpool_ops *ops);
+ void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops);
void (*destroy)(void *pool);
int (*malloc)(void *pool, size_t size, gfp_t gfp,
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 05c214760977..3283c6a55425 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -36,7 +36,7 @@ enum zs_mapmode {
struct zs_pool;
-struct zs_pool *zs_create_pool(gfp_t flags);
+struct zs_pool *zs_create_pool(char *name, gfp_t flags);
void zs_destroy_pool(struct zs_pool *pool);
unsigned long zs_malloc(struct zs_pool *pool, size_t size);
diff --git a/include/net/sock.h b/include/net/sock.h
index e13824570b0f..ab186b1d31ff 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1077,11 +1077,6 @@ static inline bool memcg_proto_active(struct cg_proto *cg_proto)
return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
}
-static inline bool memcg_proto_activated(struct cg_proto *cg_proto)
-{
- return test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags);
-}
-
#ifdef SOCK_REFCNT_DEBUG
static inline void sk_refcnt_debug_inc(struct sock *sk)
{
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 0d11c3dcd3a1..9cd8b21dddbe 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -67,7 +67,7 @@ enum mpol_rebind_step {
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
-#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
+#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
#endif /* _UAPI_LINUX_MEMPOLICY_H */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 04cfe8ace520..d5f6ec251fb2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4373,16 +4373,20 @@ static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
percpu_ref_exit(&css->refcnt);
- if (css->ss) {
+ if (ss) {
/* css free path */
+ int id = css->id;
+
if (css->parent)
css_put(css->parent);
- css->ss->css_free(css);
+ ss->css_free(css);
+ cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
} else {
/* cgroup free path */
@@ -4434,7 +4438,7 @@ static void css_release_work_fn(struct work_struct *work)
if (ss) {
/* css release path */
- cgroup_idr_remove(&ss->css_idr, css->id);
+ cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
} else {
diff --git a/kernel/compat.c b/kernel/compat.c
index ebb3c369d03d..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
* core implementation decides to return random nonsense.
*/
if (ret == -ERESTART_RESTARTBLOCK) {
- struct restart_block *restart
- = &current_thread_info()->restart_block;
+ struct restart_block *restart = &current->restart_block;
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
@@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = compat_clock_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b257f6bca2..c54a1dae6c11 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2400,7 +2400,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
*/
}
-void cpuset_init_current_mems_allowed(void)
+void __init cpuset_init_current_mems_allowed(void)
{
nodes_setall(current->mems_allowed);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 66e19c251581..cf65139615a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -555,9 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
-#ifndef __PAGETABLE_PMD_FOLDED
- atomic_long_set(&mm->nr_pmds, 0);
-#endif
+ mm_nr_pmds_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
mm->pinned_vm = 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 4eeb63de7e54..2a5e3830e953 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2217,7 +2217,7 @@ retry:
if (!abs_time)
goto out;
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02d6b6d28796..c06df7de0963 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str)
early_param("ignore_loglevel", ignore_loglevel_setup);
module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
- "print all kernel messages to the console.");
+MODULE_PARM_DESC(ignore_loglevel,
+ "ignore loglevel setting (prints all kernel messages to the console)");
#ifdef CONFIG_BOOT_PRINTK_DELAY
@@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len)
}
/*
- * Zap console related locks when oopsing. Only zap at most once
- * every 10 seconds, to leave time for slow consoles to print a
- * full oops.
+ * Zap console related locks when oopsing.
+ * To leave time for slow consoles to print a full oops,
+ * only zap at most once every 30 seconds.
*/
static void zap_locks(void)
{
static unsigned long oops_timestamp;
if (time_after_eq(jiffies, oops_timestamp) &&
- !time_after(jiffies, oops_timestamp + 30 * HZ))
+ !time_after(jiffies, oops_timestamp + 30 * HZ))
return;
oops_timestamp = jiffies;
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
EXPORT_SYMBOL_GPL(cpu_clock);
EXPORT_SYMBOL_GPL(local_clock);
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+ return local_clock();
+}
diff --git a/kernel/signal.c b/kernel/signal.c
index 16a305295256..33a52759cc0e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
*/
SYSCALL_DEFINE0(restart_syscall)
{
- struct restart_block *restart = &current_thread_info()->restart_block;
+ struct restart_block *restart = &current->restart_block;
return restart->fn(restart);
}
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index a7077d3ae52f..1b001ed1edb9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
goto out;
}
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3f5e183c3d97..bee0c1f78091 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1583,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
goto out;
}
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.rmtp = rmtp;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b67859e2a..0075da74abf0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
struct timespec *rqtp, struct timespec __user *rmtp)
{
- struct restart_block *restart_block =
- &current_thread_info()->restart_block;
+ struct restart_block *restart_block = &current->restart_block;
struct itimerspec it;
int error;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf11815f84..3174bf8e3538 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,7 @@ static int get_softlockup_thresh(void)
*/
static unsigned long get_timestamp(void)
{
- return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
+ return running_clock() >> 30LL; /* 2^30 ~= 10^9 */
}
static void set_sample_period(void)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e5ea3ab856bf..79a9bb67aeaf 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1580,6 +1580,9 @@ config ASYNC_RAID6_TEST
If unsure, say N.
+config TEST_HEXDUMP
+ tristate "Test functions located in the hexdump module at runtime"
+
config TEST_STRING_HELPERS
tristate "Test functions located in the string_helpers module at runtime"
diff --git a/lib/Makefile b/lib/Makefile
index 25c061f77df7..e456defd1021 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -23,12 +23,14 @@ lib-y += kobject.o klist.o
obj-y += lockref.o
obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
- bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
+ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o clz_ctz.o \
bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
+obj-y += hexdump.o
+obj-$(CONFIG_TEST_HEXDUMP) += test-hexdump.o
obj-y += kstrtox.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
obj-$(CONFIG_TEST_LKM) += test_module.o
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 324ea9eab8c1..ad161a6c82db 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -744,10 +744,10 @@ EXPORT_SYMBOL(bitmap_parselist_user);
/**
* bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
* @buf: pointer to a bitmap
- * @pos: a bit position in @buf (0 <= @pos < @bits)
- * @bits: number of valid bit positions in @buf
+ * @pos: a bit position in @buf (0 <= @pos < @nbits)
+ * @nbits: number of valid bit positions in @buf
*
- * Map the bit at position @pos in @buf (of length @bits) to the
+ * Map the bit at position @pos in @buf (of length @nbits) to the
* ordinal of which set bit it is. If it is not set or if @pos
* is not a valid bit position, map to -1.
*
@@ -759,56 +759,40 @@ EXPORT_SYMBOL(bitmap_parselist_user);
*
* The bit positions 0 through @bits are valid positions in @buf.
*/
-static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits)
+static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
- int i, ord;
-
- if (pos < 0 || pos >= bits || !test_bit(pos, buf))
+ if (pos >= nbits || !test_bit(pos, buf))
return -1;
- i = find_first_bit(buf, bits);
- ord = 0;
- while (i < pos) {
- i = find_next_bit(buf, bits, i + 1);
- ord++;
- }
- BUG_ON(i != pos);
-
- return ord;
+ return __bitmap_weight(buf, pos);
}
/**
* bitmap_ord_to_pos - find position of n-th set bit in bitmap
* @buf: pointer to bitmap
* @ord: ordinal bit position (n-th set bit, n >= 0)
- * @bits: number of valid bit positions in @buf
+ * @nbits: number of valid bit positions in @buf
*
* Map the ordinal offset of bit @ord in @buf to its position in @buf.
- * Value of @ord should be in range 0 <= @ord < weight(buf), else
- * results are undefined.
+ * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord
+ * >= weight(buf), returns @nbits.
*
* If for example, just bits 4 through 7 are set in @buf, then @ord
* values 0 through 3 will get mapped to 4 through 7, respectively,
- * and all other @ord values return undefined values. When @ord value 3
+ * and all other @ord values returns @nbits. When @ord value 3
* gets mapped to (returns) @pos value 7 in this example, that means
* that the 3rd set bit (starting with 0th) is at position 7 in @buf.
*
- * The bit positions 0 through @bits are valid positions in @buf.
+ * The bit positions 0 through @nbits-1 are valid positions in @buf.
*/
-int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
+unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits)
{
- int pos = 0;
-
- if (ord >= 0 && ord < bits) {
- int i;
+ unsigned int pos;
- for (i = find_first_bit(buf, bits);
- i < bits && ord > 0;
- i = find_next_bit(buf, bits, i + 1))
- ord--;
- if (i < bits && ord == 0)
- pos = i;
- }
+ for (pos = find_first_bit(buf, nbits);
+ pos < nbits && ord;
+ pos = find_next_bit(buf, nbits, pos + 1))
+ ord--;
return pos;
}
@@ -819,7 +803,7 @@ int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
* @src: subset to be remapped
* @old: defines domain of map
* @new: defines range of map
- * @bits: number of bits in each of these bitmaps
+ * @nbits: number of bits in each of these bitmaps
*
* Let @old and @new define a mapping of bit positions, such that
* whatever position is held by the n-th set bit in @old is mapped
@@ -847,22 +831,22 @@ int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
*/
void bitmap_remap(unsigned long *dst, const unsigned long *src,
const unsigned long *old, const unsigned long *new,
- int bits)
+ unsigned int nbits)
{
- int oldbit, w;
+ unsigned int oldbit, w;
if (dst == src) /* following doesn't handle inplace remaps */
return;
- bitmap_zero(dst, bits);
+ bitmap_zero(dst, nbits);
- w = bitmap_weight(new, bits);
- for_each_set_bit(oldbit, src, bits) {
- int n = bitmap_pos_to_ord(old, oldbit, bits);
+ w = bitmap_weight(new, nbits);
+ for_each_set_bit(oldbit, src, nbits) {
+ int n = bitmap_pos_to_ord(old, oldbit, nbits);
if (n < 0 || w == 0)
set_bit(oldbit, dst); /* identity map */
else
- set_bit(bitmap_ord_to_pos(new, n % w, bits), dst);
+ set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst);
}
}
EXPORT_SYMBOL(bitmap_remap);
@@ -1006,9 +990,9 @@ EXPORT_SYMBOL(bitmap_bitremap);
* All bits in @dst not set by the above rule are cleared.
*/
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
- const unsigned long *relmap, int bits)
+ const unsigned long *relmap, unsigned int bits)
{
- int n, m; /* same meaning as in above comment */
+ unsigned int n, m; /* same meaning as in above comment */
if (dst == orig) /* following doesn't handle inplace mappings */
return;
@@ -1039,22 +1023,22 @@ EXPORT_SYMBOL(bitmap_onto);
* @dst: resulting smaller bitmap
* @orig: original larger bitmap
* @sz: specified size
- * @bits: number of bits in each of these bitmaps
+ * @nbits: number of bits in each of these bitmaps
*
* For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
* Clear all other bits in @dst. See further the comment and
* Example [2] for bitmap_onto() for why and how to use this.
*/
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
- int sz, int bits)
+ unsigned int sz, unsigned int nbits)
{
- int oldbit;
+ unsigned int oldbit;
if (dst == orig) /* following doesn't handle inplace mappings */
return;
- bitmap_zero(dst, bits);
+ bitmap_zero(dst, nbits);
- for_each_set_bit(oldbit, orig, bits)
+ for_each_set_bit(oldbit, orig, nbits)
set_bit(oldbit % sz, dst);
}
EXPORT_SYMBOL(bitmap_fold);
diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
index 0777c5a45fa0..f346715e2255 100644
--- a/lib/dynamic_queue_limits.c
+++ b/lib/dynamic_queue_limits.c
@@ -3,12 +3,12 @@
*
* Copyright (c) 2011, Tom Herbert <therbert@google.com>
*/
-#include <linux/module.h>
#include <linux/types.h>
-#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/dynamic_queue_limits.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
#define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 2e65d206b01c..0fe1cbe87700 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -34,7 +34,6 @@
#include <linux/rculist.h>
#include <linux/interrupt.h>
#include <linux/genalloc.h>
-#include <linux/of_address.h>
#include <linux/of_device.h>
static inline size_t chunk_size(const struct gen_pool_chunk *chunk)
@@ -415,7 +414,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
size_t size)
{
bool found = false;
- unsigned long end = start + size;
+ unsigned long end = start + size - 1;
struct gen_pool_chunk *chunk;
rcu_read_lock();
diff --git a/lib/halfmd4.c b/lib/halfmd4.c
index 66d0ee8b7776..a8fe6274a13c 100644
--- a/lib/halfmd4.c
+++ b/lib/halfmd4.c
@@ -1,4 +1,4 @@
-#include <linux/kernel.h>
+#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/cryptohash.h>
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 270773b91923..7ea09699855d 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -97,63 +97,79 @@ EXPORT_SYMBOL(bin2hex);
*
* example output buffer:
* 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO
+ *
+ * Return:
+ * The amount of bytes placed in the buffer without terminating NUL. If the
+ * output was truncated, then the return value is the number of bytes
+ * (excluding the terminating NUL) which would have been written to the final
+ * string if enough space had been available.
*/
-void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
- int groupsize, char *linebuf, size_t linebuflen,
- bool ascii)
+int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize,
+ char *linebuf, size_t linebuflen, bool ascii)
{
const u8 *ptr = buf;
+ int ngroups;
u8 ch;
int j, lx = 0;
int ascii_column;
+ int ret;
if (rowsize != 16 && rowsize != 32)
rowsize = 16;
- if (!len)
- goto nil;
if (len > rowsize) /* limit to one line at a time */
len = rowsize;
+ if (!is_power_of_2(groupsize) || groupsize > 8)
+ groupsize = 1;
if ((len % groupsize) != 0) /* no mixed size output */
groupsize = 1;
- switch (groupsize) {
- case 8: {
- const u64 *ptr8 = buf;
- int ngroups = len / groupsize;
+ ngroups = len / groupsize;
+ ascii_column = rowsize * 2 + rowsize / groupsize + 1;
- for (j = 0; j < ngroups; j++)
- lx += scnprintf(linebuf + lx, linebuflen - lx,
- "%s%16.16llx", j ? " " : "",
- (unsigned long long)*(ptr8 + j));
- ascii_column = 17 * ngroups + 2;
- break;
- }
+ if (!linebuflen)
+ goto overflow1;
- case 4: {
- const u32 *ptr4 = buf;
- int ngroups = len / groupsize;
+ if (!len)
+ goto nil;
- for (j = 0; j < ngroups; j++)
- lx += scnprintf(linebuf + lx, linebuflen - lx,
- "%s%8.8x", j ? " " : "", *(ptr4 + j));
- ascii_column = 9 * ngroups + 2;
- break;
- }
+ if (groupsize == 8) {
+ const u64 *ptr8 = buf;
- case 2: {
- const u16 *ptr2 = buf;
- int ngroups = len / groupsize;
+ for (j = 0; j < ngroups; j++) {
+ ret = snprintf(linebuf + lx, linebuflen - lx,
+ "%s%16.16llx", j ? " " : "",
+ (unsigned long long)*(ptr8 + j));
+ if (ret >= linebuflen - lx)
+ goto overflow1;
+ lx += ret;
+ }
+ } else if (groupsize == 4) {
+ const u32 *ptr4 = buf;
- for (j = 0; j < ngroups; j++)
- lx += scnprintf(linebuf + lx, linebuflen - lx,
- "%s%4.4x", j ? " " : "", *(ptr2 + j));
- ascii_column = 5 * ngroups + 2;
- break;
- }
+ for (j = 0; j < ngroups; j++) {
+ ret = snprintf(linebuf + lx, linebuflen - lx,
+ "%s%8.8x", j ? " " : "",
+ *(ptr4 + j));
+ if (ret >= linebuflen - lx)
+ goto overflow1;
+ lx += ret;
+ }
+ } else if (groupsize == 2) {
+ const u16 *ptr2 = buf;
- default:
- for (j = 0; (j < len) && (lx + 3) <= linebuflen; j++) {
+ for (j = 0; j < ngroups; j++) {
+ ret = snprintf(linebuf + lx, linebuflen - lx,
+ "%s%4.4x", j ? " " : "",
+ *(ptr2 + j));
+ if (ret >= linebuflen - lx)
+ goto overflow1;
+ lx += ret;
+ }
+ } else {
+ for (j = 0; j < len; j++) {
+ if (linebuflen < lx + 3)
+ goto overflow2;
ch = ptr[j];
linebuf[lx++] = hex_asc_hi(ch);
linebuf[lx++] = hex_asc_lo(ch);
@@ -161,21 +177,28 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
}
if (j)
lx--;
-
- ascii_column = 3 * rowsize + 2;
- break;
}
if (!ascii)
goto nil;
- while (lx < (linebuflen - 1) && lx < (ascii_column - 1))
+ while (lx < ascii_column) {
+ if (linebuflen < lx + 2)
+ goto overflow2;
linebuf[lx++] = ' ';
- for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) {
+ }
+ for (j = 0; j < len; j++) {
+ if (linebuflen < lx + 2)
+ goto overflow2;
ch = ptr[j];
linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.';
}
nil:
+ linebuf[lx] = '\0';
+ return lx;
+overflow2:
linebuf[lx++] = '\0';
+overflow1:
+ return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1;
}
EXPORT_SYMBOL(hex_dump_to_buffer);
diff --git a/lib/idr.c b/lib/idr.c
index e654aebd5f80..5335c43adf46 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -30,7 +30,6 @@
#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
-#include <linux/hardirq.h>
#define MAX_IDR_SHIFT (sizeof(int) * 8 - 1)
#define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
diff --git a/lib/interval_tree.c b/lib/interval_tree.c
index f367f9ad544c..c85f6600a5f8 100644
--- a/lib/interval_tree.c
+++ b/lib/interval_tree.c
@@ -1,7 +1,7 @@
-#include <linux/init.h>
#include <linux/interval_tree.h>
#include <linux/interval_tree_generic.h>
-#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
#define START(node) ((node)->start)
#define LAST(node) ((node)->last)
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 9ebf9e20de53..f6c2c1e7779c 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -20,7 +20,6 @@
#include <linux/export.h>
#include <linux/kmod.h>
#include <linux/slab.h>
-#include <linux/user_namespace.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
diff --git a/lib/lcm.c b/lib/lcm.c
index 51cc6b13cd52..e97dbd51e756 100644
--- a/lib/lcm.c
+++ b/lib/lcm.c
@@ -1,4 +1,4 @@
-#include <linux/kernel.h>
+#include <linux/compiler.h>
#include <linux/gcd.h>
#include <linux/export.h>
#include <linux/lcm.h>
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 12bcba1c8612..b29015102698 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -2,9 +2,11 @@
#define pr_fmt(fmt) "list_sort_test: " fmt
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/string.h>
#include <linux/list_sort.h>
-#include <linux/slab.h>
#include <linux/list.h>
#define MAX_LIST_LENGTH_BITS 20
@@ -146,6 +148,7 @@ EXPORT_SYMBOL(list_sort);
#ifdef CONFIG_TEST_LIST_SORT
+#include <linux/slab.h>
#include <linux/random.h>
/*
diff --git a/lib/llist.c b/lib/llist.c
index f76196d07409..0b0e9779d675 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -24,7 +24,6 @@
*/
#include <linux/kernel.h>
#include <linux/export.h>
-#include <linux/interrupt.h>
#include <linux/llist.h>
diff --git a/lib/md5.c b/lib/md5.c
index 958a3c15923c..bb0cd01d356d 100644
--- a/lib/md5.c
+++ b/lib/md5.c
@@ -1,4 +1,4 @@
-#include <linux/kernel.h>
+#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/cryptohash.h>
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 9c3e85ff0a6c..76a1b59523ab 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -9,7 +9,6 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
-#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
index 93d145e5539c..f75715131f20 100644
--- a/lib/percpu_ida.c
+++ b/lib/percpu_ida.c
@@ -19,13 +19,10 @@
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/export.h>
-#include <linux/hardirq.h>
-#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/sched.h>
-#include <linux/slab.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/percpu_ida.h>
diff --git a/lib/plist.c b/lib/plist.c
index d408e774b746..3a30c53db061 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -25,7 +25,6 @@
#include <linux/bug.h>
#include <linux/plist.h>
-#include <linux/spinlock.h>
#ifdef CONFIG_DEBUG_PI_LIST
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 3291a8e37490..3d2aa27b845b 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -33,7 +33,7 @@
#include <linux/string.h>
#include <linux/bitops.h>
#include <linux/rcupdate.h>
-#include <linux/hardirq.h> /* in_interrupt() */
+#include <linux/preempt_mask.h> /* in_interrupt() */
/*
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 7de89f4a36cf..adc98e1825ba 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -6,7 +6,6 @@
*/
#include <linux/mm.h>
-#include <linux/nmi.h>
#include <linux/quicklist.h>
#include <linux/cma.h>
diff --git a/lib/sort.c b/lib/sort.c
index 926d00429ed2..43c9fe73ae2e 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -4,10 +4,9 @@
* Jan 23 2005 Matt Mackall <mpm@selenic.com>
*/
-#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/export.h>
#include <linux/sort.h>
-#include <linux/slab.h>
static void u32_swap(void *a, void *b, int size)
{
@@ -85,6 +84,7 @@ void sort(void *base, size_t num, size_t size,
EXPORT_SYMBOL(sort);
#if 0
+#include <linux/slab.h>
/* a simple boot-time regression test */
int cmpint(const void *a, const void *b)
diff --git a/lib/stmp_device.c b/lib/stmp_device.c
index 8ac9bcc4289a..a904656f4fd7 100644
--- a/lib/stmp_device.c
+++ b/lib/stmp_device.c
@@ -15,7 +15,8 @@
#include <linux/io.h>
#include <linux/errno.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
#include <linux/stmp_device.h>
#define STMP_MODULE_CLKGATE (1 << 30)
diff --git a/lib/string.c b/lib/string.c
index 10063300b830..3206d0178296 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -58,14 +58,6 @@ int strncasecmp(const char *s1, const char *s2, size_t len)
}
EXPORT_SYMBOL(strncasecmp);
#endif
-#ifndef __HAVE_ARCH_STRNICMP
-#undef strnicmp
-int strnicmp(const char *s1, const char *s2, size_t len)
-{
- return strncasecmp(s1, s2, len);
-}
-EXPORT_SYMBOL(strnicmp);
-#endif
#ifndef __HAVE_ARCH_STRCASECMP
int strcasecmp(const char *s1, const char *s2)
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 58b78ba57439..8f8c4417f228 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -20,19 +20,18 @@
* @len: length of buffer
*
* This function returns a string formatted to 3 significant figures
- * giving the size in the required units. Returns 0 on success or
- * error on failure. @buf is always zero terminated.
+ * giving the size in the required units. @buf should have room for
+ * at least 9 bytes and will always be zero terminated.
*
*/
-int string_get_size(u64 size, const enum string_size_units units,
- char *buf, int len)
+void string_get_size(u64 size, const enum string_size_units units,
+ char *buf, int len)
{
static const char *const units_10[] = {
- "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", NULL
+ "B", "kB", "MB", "GB", "TB", "PB", "EB"
};
static const char *const units_2[] = {
- "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB",
- NULL
+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"
};
static const char *const *const units_str[] = {
[STRING_UNITS_10] = units_10,
@@ -43,13 +42,13 @@ int string_get_size(u64 size, const enum string_size_units units,
[STRING_UNITS_2] = 1024,
};
int i, j;
- u64 remainder = 0, sf_cap;
+ u32 remainder = 0, sf_cap;
char tmp[8];
tmp[0] = '\0';
i = 0;
if (size >= divisor[units]) {
- while (size >= divisor[units] && units_str[units][i]) {
+ while (size >= divisor[units]) {
remainder = do_div(size, divisor[units]);
i++;
}
@@ -60,17 +59,14 @@ int string_get_size(u64 size, const enum string_size_units units,
if (j) {
remainder *= 1000;
- do_div(remainder, divisor[units]);
- snprintf(tmp, sizeof(tmp), ".%03lld",
- (unsigned long long)remainder);
+ remainder /= divisor[units];
+ snprintf(tmp, sizeof(tmp), ".%03u", remainder);
tmp[j+1] = '\0';
}
}
- snprintf(buf, len, "%lld%s %s", (unsigned long long)size,
+ snprintf(buf, len, "%u%s %s", (u32)size,
tmp, units_str[units][i]);
-
- return 0;
}
EXPORT_SYMBOL(string_get_size);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index bb2b201d6ad0..e0af6ff73d14 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,4 +1,5 @@
-#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c
new file mode 100644
index 000000000000..daf29a390a89
--- /dev/null
+++ b/lib/test-hexdump.c
@@ -0,0 +1,180 @@
+/*
+ * Test cases for lib/hexdump.c module.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/string.h>
+
+static const unsigned char data_b[] = {
+ '\xbe', '\x32', '\xdb', '\x7b', '\x0a', '\x18', '\x93', '\xb2', /* 00 - 07 */
+ '\x70', '\xba', '\xc4', '\x24', '\x7d', '\x83', '\x34', '\x9b', /* 08 - 0f */
+ '\xa6', '\x9c', '\x31', '\xad', '\x9c', '\x0f', '\xac', '\xe9', /* 10 - 17 */
+ '\x4c', '\xd1', '\x19', '\x99', '\x43', '\xb1', '\xaf', '\x0c', /* 18 - 1f */
+};
+
+static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C...";
+
+static const char *test_data_1_le[] __initconst = {
+ "be", "32", "db", "7b", "0a", "18", "93", "b2",
+ "70", "ba", "c4", "24", "7d", "83", "34", "9b",
+ "a6", "9c", "31", "ad", "9c", "0f", "ac", "e9",
+ "4c", "d1", "19", "99", "43", "b1", "af", "0c",
+};
+
+static const char *test_data_2_le[] __initconst = {
+ "32be", "7bdb", "180a", "b293",
+ "ba70", "24c4", "837d", "9b34",
+ "9ca6", "ad31", "0f9c", "e9ac",
+ "d14c", "9919", "b143", "0caf",
+};
+
+static const char *test_data_4_le[] __initconst = {
+ "7bdb32be", "b293180a", "24c4ba70", "9b34837d",
+ "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143",
+};
+
+static const char *test_data_8_le[] __initconst = {
+ "b293180a7bdb32be", "9b34837d24c4ba70",
+ "e9ac0f9cad319ca6", "0cafb1439919d14c",
+};
+
+static void __init test_hexdump(size_t len, int rowsize, int groupsize,
+ bool ascii)
+{
+ char test[32 * 3 + 2 + 32 + 1];
+ char real[32 * 3 + 2 + 32 + 1];
+ char *p;
+ const char **result;
+ size_t l = len;
+ int gs = groupsize, rs = rowsize;
+ unsigned int i;
+
+ hex_dump_to_buffer(data_b, l, rs, gs, real, sizeof(real), ascii);
+
+ if (rs != 16 && rs != 32)
+ rs = 16;
+
+ if (l > rs)
+ l = rs;
+
+ if (!is_power_of_2(gs) || gs > 8 || (len % gs != 0))
+ gs = 1;
+
+ if (gs == 8)
+ result = test_data_8_le;
+ else if (gs == 4)
+ result = test_data_4_le;
+ else if (gs == 2)
+ result = test_data_2_le;
+ else
+ result = test_data_1_le;
+
+ memset(test, ' ', sizeof(test));
+
+ /* hex dump */
+ p = test;
+ for (i = 0; i < l / gs; i++) {
+ const char *q = *result++;
+ size_t amount = strlen(q);
+
+ strncpy(p, q, amount);
+ p += amount + 1;
+ }
+ if (i)
+ p--;
+
+ /* ASCII part */
+ if (ascii) {
+ p = test + rs * 2 + rs / gs + 1;
+ strncpy(p, data_a, l);
+ p += l;
+ }
+
+ *p = '\0';
+
+ if (strcmp(test, real)) {
+ pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize);
+ pr_err("Result: '%s'\n", real);
+ pr_err("Expect: '%s'\n", test);
+ }
+}
+
+static void __init test_hexdump_set(int rowsize, bool ascii)
+{
+ size_t d = min_t(size_t, sizeof(data_b), rowsize);
+ size_t len = get_random_int() % d + 1;
+
+ test_hexdump(len, rowsize, 4, ascii);
+ test_hexdump(len, rowsize, 2, ascii);
+ test_hexdump(len, rowsize, 8, ascii);
+ test_hexdump(len, rowsize, 1, ascii);
+}
+
+static void __init test_hexdump_overflow(bool ascii)
+{
+ char buf[56];
+ const char *t = test_data_1_le[0];
+ size_t l = get_random_int() % sizeof(buf);
+ bool a;
+ int e, r;
+
+ memset(buf, ' ', sizeof(buf));
+
+ r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii);
+
+ if (ascii)
+ e = 50;
+ else
+ e = 2;
+ buf[e + 2] = '\0';
+
+ if (!l) {
+ a = r == e && buf[0] == ' ';
+ } else if (l < 3) {
+ a = r == e && buf[0] == '\0';
+ } else if (l < 4) {
+ a = r == e && !strcmp(buf, t);
+ } else if (ascii) {
+ if (l < 51)
+ a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' ';
+ else
+ a = r == e && buf[50] == '\0' && buf[49] == '.';
+ } else {
+ a = r == e && buf[e] == '\0';
+ }
+
+ if (!a) {
+ pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf));
+ pr_err("Result: '%s'\n", buf);
+ }
+}
+
+static int __init test_hexdump_init(void)
+{
+ unsigned int i;
+ int rowsize;
+
+ pr_info("Running tests...\n");
+
+ rowsize = (get_random_int() % 2 + 1) * 16;
+ for (i = 0; i < 16; i++)
+ test_hexdump_set(rowsize, false);
+
+ rowsize = (get_random_int() % 2 + 1) * 16;
+ for (i = 0; i < 16; i++)
+ test_hexdump_set(rowsize, true);
+
+ for (i = 0; i < 16; i++)
+ test_hexdump_overflow(false);
+
+ for (i = 0; i < 16; i++)
+ test_hexdump_overflow(true);
+
+ return -EINVAL;
+}
+module_init(test_hexdump_init);
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index ec337f64f52d..602d2081e713 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -114,8 +114,9 @@ int skip_atoi(const char **s)
{
int i = 0;
- while (isdigit(**s))
+ do {
i = i*10 + *((*s)++) - '0';
+ } while (isdigit(**s));
return i;
}
@@ -1604,8 +1605,7 @@ qualifier:
case 'p':
spec->type = FORMAT_TYPE_PTR;
- return fmt - start;
- /* skip alnum */
+ return ++fmt - start;
case '%':
spec->type = FORMAT_TYPE_PERCENT_CHAR;
@@ -1728,7 +1728,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
/* Reject out-of-range values early. Large positive sizes are
used for unknown buffer sizes. */
- if (WARN_ON_ONCE((int) size < 0))
+ if (WARN_ON_ONCE(size > INT_MAX))
return 0;
str = buf;
@@ -1794,7 +1794,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
break;
case FORMAT_TYPE_PTR:
- str = pointer(fmt+1, str, end, va_arg(args, void *),
+ str = pointer(fmt, str, end, va_arg(args, void *),
spec);
while (isalnum(*fmt))
fmt++;
@@ -2232,7 +2232,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
}
case FORMAT_TYPE_PTR:
- str = pointer(fmt+1, str, end, get_arg(void *), spec);
+ str = pointer(fmt, str, end, get_arg(void *), spec);
while (isalnum(*fmt))
fmt++;
break;
diff --git a/mm/Kconfig b/mm/Kconfig
index 4395b12869c8..de5239c152f9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -602,6 +602,16 @@ config PGTABLE_MAPPING
You can check speed with zsmalloc benchmark:
https://github.com/spartacus06/zsmapbench
+config ZSMALLOC_STAT
+ bool "Export zsmalloc statistics"
+ depends on ZSMALLOC
+ select DEBUG_FS
+ help
+ This option enables code in the zsmalloc to collect various
+ statistics about whats happening in zsmalloc and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
config GENERIC_EARLY_IOREMAP
bool
diff --git a/mm/compaction.c b/mm/compaction.c
index b68736c8a1ce..d50d6de6f1b6 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -490,6 +490,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/* If a page was split, advance to the end of it */
if (isolated) {
+ cc->nr_freepages += isolated;
+ if (!strict &&
+ cc->nr_migratepages <= cc->nr_freepages) {
+ blockpfn += isolated;
+ break;
+ }
+
blockpfn += isolated - 1;
cursor += isolated - 1;
continue;
@@ -899,7 +906,6 @@ static void isolate_freepages(struct compact_control *cc)
unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
- int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
@@ -924,11 +930,11 @@ static void isolate_freepages(struct compact_control *cc)
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
+ for (; block_start_pfn >= low_pfn &&
+ cc->nr_migratepages > cc->nr_freepages;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
- unsigned long isolated;
/*
* This can iterate a massively long zone without finding any
@@ -953,9 +959,8 @@ static void isolate_freepages(struct compact_control *cc)
continue;
/* Found a block suitable for isolating free pages from. */
- isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ isolate_freepages_block(cc, &isolate_start_pfn,
block_end_pfn, freelist, false);
- nr_freepages += isolated;
/*
* Remember where the free scanner should restart next time,
@@ -987,8 +992,6 @@ static void isolate_freepages(struct compact_control *cc)
*/
if (block_start_pfn < low_pfn)
cc->free_pfn = cc->migrate_pfn;
-
- cc->nr_freepages = nr_freepages;
}
/*
@@ -1100,8 +1103,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
isolate_mode);
- if (!low_pfn || cc->contended)
+ if (!low_pfn || cc->contended) {
+ acct_isolated(zone, cc);
return ISOLATE_ABORT;
+ }
/*
* Either we isolated something and proceed with migration. Or
@@ -1173,7 +1178,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_PARTIAL;
/* Job done if allocation would set block type */
- if (cc->order >= pageblock_order && area->nr_free)
+ if (order >= pageblock_order && area->nr_free)
return COMPACT_PARTIAL;
}
diff --git a/mm/gup.c b/mm/gup.c
index c2da1163986a..51bf0b06ca7b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -64,7 +64,7 @@ retry:
migration_entry_wait(mm, pmd, address);
goto retry;
}
- if ((flags & FOLL_NUMA) && pte_numa(pte))
+ if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte)) {
pte_unmap_unlock(ptep, ptl);
@@ -184,7 +184,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
return page;
return no_page_table(vma, flags);
}
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
@@ -906,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
/*
* Similar to the PMD case below, NUMA hinting must take slow
- * path
+ * path using the pte_protnone check.
*/
if (!pte_present(pte) || pte_special(pte) ||
- pte_numa(pte) || (write && !pte_write(pte)))
+ pte_protnone(pte) || (write && !pte_write(pte)))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1104,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
- if (pmd_numa(pmd))
+ if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cb7be110cad3..fc00c8cb5a82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1211,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
goto out;
page = pmd_page(*pmd);
@@ -1262,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
bool migrated = false;
int flags = 0;
+ /* A PROT_NONE fault should not end up here */
+ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
@@ -1272,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* check_same as the page may no longer be mapped.
*/
if (unlikely(pmd_trans_migrating(*pmdp))) {
+ page = pmd_page(*pmdp);
spin_unlock(ptl);
- wait_migrate_huge_page(vma->anon_vma, pmdp);
+ wait_on_page_locked(page);
goto out;
}
@@ -1341,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Migrate the THP to the requested node, returns with page unlocked
- * and pmd_numa cleared.
+ * and access rights restored.
*/
spin_unlock(ptl);
migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1354,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
- pmd = pmd_mknonnuma(pmd);
+ pmd = pmd_modify(pmd, vma->vm_page_prot);
set_pmd_at(mm, haddr, pmdp, pmd);
- VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
unlock_page(page);
out_unlock:
@@ -1479,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
- ret = 1;
- if (!prot_numa) {
+
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (!prot_numa || !pmd_protnone(*pmd)) {
+ ret = 1;
entry = pmdp_get_and_clear_notify(mm, addr, pmd);
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
- } else {
- struct page *page = pmd_page(*pmd);
-
- /*
- * Do not trap faults against the zero page. The
- * read-only data is likely to be read-cached on the
- * local CPU cache and it is less useful to know about
- * local vs remote hits on the zero page.
- */
- if (!is_huge_zero_page(page) &&
- !pmd_numa(*pmd)) {
- pmdp_set_numa(mm, addr, pmd);
- ret = HPAGE_PMD_NR;
- }
}
spin_unlock(ptl);
}
@@ -1766,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
/*
- * Note that pmd_numa is not transferred deliberately
- * to avoid any possibility that pte_numa leaks to
- * a PROT_NONE VMA by accident.
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
*/
entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
diff --git a/mm/internal.h b/mm/internal.h
index c4d6c9b43491..a96da5b0029d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -351,8 +351,10 @@ extern int mminit_loglevel;
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
if (level < mminit_loglevel) { \
- printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
- printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
+ if (level <= MMINIT_WARNING) \
+ printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
+ else \
+ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
} while (0)
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f1a0db194173..909eca2c820e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,18 +9,100 @@
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/memcontrol.h>
+
+#ifdef CONFIG_MEMCG_KMEM
+static LIST_HEAD(list_lrus);
+static DEFINE_MUTEX(list_lrus_mutex);
+
+static void list_lru_register(struct list_lru *lru)
+{
+ mutex_lock(&list_lrus_mutex);
+ list_add(&lru->list, &list_lrus);
+ mutex_unlock(&list_lrus_mutex);
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+ mutex_lock(&list_lrus_mutex);
+ list_del(&lru->list);
+ mutex_unlock(&list_lrus_mutex);
+}
+#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return !!lru->node[0].memcg_lrus;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+ /*
+ * The lock protects the array of per cgroup lists from relocation
+ * (see memcg_update_list_lru_node).
+ */
+ lockdep_assert_held(&nlru->lock);
+ if (nlru->memcg_lrus && idx >= 0)
+ return nlru->memcg_lrus->lru[idx];
+
+ return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+ struct mem_cgroup *memcg;
+
+ if (!nlru->memcg_lrus)
+ return &nlru->lru;
+
+ memcg = mem_cgroup_from_kmem(ptr);
+ if (!memcg)
+ return &nlru->lru;
+
+ return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+}
+#else
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return false;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+ return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+ return &nlru->lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l = list_lru_from_kmem(nlru, item);
if (list_empty(item)) {
- list_add_tail(item, &nlru->list);
- if (nlru->nr_items++ == 0)
- node_set(nid, lru->active_nodes);
+ list_add_tail(item, &l->list);
+ l->nr_items++;
spin_unlock(&nlru->lock);
return true;
}
@@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
spin_lock(&nlru->lock);
+ l = list_lru_from_kmem(nlru, item);
if (!list_empty(item)) {
list_del_init(item);
- if (--nlru->nr_items == 0)
- node_clear(nid, lru->active_nodes);
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l->nr_items--;
spin_unlock(&nlru->lock);
return true;
}
@@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
}
EXPORT_SYMBOL_GPL(list_lru_del);
-unsigned long
-list_lru_count_node(struct list_lru *lru, int nid)
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
+{
+ list_del_init(item);
+ list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate);
+
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+ struct list_head *head)
+{
+ list_move(item, head);
+ list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate_move);
+
+static unsigned long __list_lru_count_one(struct list_lru *lru,
+ int nid, int memcg_idx)
{
- unsigned long count = 0;
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
+ unsigned long count;
spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
- count += nlru->nr_items;
+ l = list_lru_from_memcg_idx(nlru, memcg_idx);
+ count = l->nr_items;
spin_unlock(&nlru->lock);
return count;
}
+
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg)
+{
+ return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
+}
+EXPORT_SYMBOL_GPL(list_lru_count_one);
+
+unsigned long list_lru_count_node(struct list_lru *lru, int nid)
+{
+ long count = 0;
+ int memcg_idx;
+
+ count += __list_lru_count_one(lru, nid, -1);
+ if (list_lru_memcg_aware(lru)) {
+ for_each_memcg_cache_index(memcg_idx)
+ count += __list_lru_count_one(lru, nid, memcg_idx);
+ }
+ return count;
+}
EXPORT_SYMBOL_GPL(list_lru_count_node);
-unsigned long
-list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
- void *cb_arg, unsigned long *nr_to_walk)
+static unsigned long
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
spin_lock(&nlru->lock);
+ l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
- list_for_each_safe(item, n, &nlru->list) {
+ list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
/*
@@ -85,14 +206,11 @@ restart:
break;
--*nr_to_walk;
- ret = isolate(item, &nlru->lock, cb_arg);
+ ret = isolate(item, l, &nlru->lock, cb_arg);
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
- if (--nlru->nr_items == 0)
- node_clear(nid, lru->active_nodes);
- WARN_ON_ONCE(nlru->nr_items < 0);
isolated++;
/*
* If the lru lock has been dropped, our list
@@ -103,7 +221,7 @@ restart:
goto restart;
break;
case LRU_ROTATE:
- list_move_tail(item, &nlru->list);
+ list_move_tail(item, &l->list);
break;
case LRU_SKIP:
break;
@@ -122,31 +240,322 @@ restart:
spin_unlock(&nlru->lock);
return isolated;
}
+
+unsigned long
+list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
+ isolate, cb_arg, nr_to_walk);
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_one);
+
+unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ long isolated = 0;
+ int memcg_idx;
+
+ isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
+ nr_to_walk);
+ if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
+ for_each_memcg_cache_index(memcg_idx) {
+ isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+ isolate, cb_arg, nr_to_walk);
+ if (*nr_to_walk <= 0)
+ break;
+ }
+ }
+ return isolated;
+}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
+static void init_one_lru(struct list_lru_one *l)
+{
+ INIT_LIST_HEAD(&l->list);
+ l->nr_items = 0;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
+ int begin, int end)
+{
+ int i;
+
+ for (i = begin; i < end; i++)
+ kfree(memcg_lrus->lru[i]);
+}
+
+static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
+ int begin, int end)
+{
+ int i;
+
+ for (i = begin; i < end; i++) {
+ struct list_lru_one *l;
+
+ l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
+ if (!l)
+ goto fail;
+
+ init_one_lru(l);
+ memcg_lrus->lru[i] = l;
+ }
+ return 0;
+fail:
+ __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
+ return -ENOMEM;
+}
+
+static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+{
+ int size = memcg_nr_cache_ids;
+
+ nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
+ if (!nlru->memcg_lrus)
+ return -ENOMEM;
+
+ if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
+ kfree(nlru->memcg_lrus);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
+{
+ __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
+ kfree(nlru->memcg_lrus);
+}
+
+static int memcg_update_list_lru_node(struct list_lru_node *nlru,
+ int old_size, int new_size)
+{
+ struct list_lru_memcg *old, *new;
+
+ BUG_ON(old_size > new_size);
+
+ old = nlru->memcg_lrus;
+ new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (__memcg_init_list_lru_node(new, old_size, new_size)) {
+ kfree(new);
+ return -ENOMEM;
+ }
+
+ memcpy(new, old, old_size * sizeof(void *));
+
+ /*
+ * The lock guarantees that we won't race with a reader
+ * (see list_lru_from_memcg_idx).
+ *
+ * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+ * we have to use IRQ-safe primitives here to avoid deadlock.
+ */
+ spin_lock_irq(&nlru->lock);
+ nlru->memcg_lrus = new;
+ spin_unlock_irq(&nlru->lock);
+
+ kfree(old);
+ return 0;
+}
+
+static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
+ int old_size, int new_size)
+{
+ /* do not bother shrinking the array back to the old size, because we
+ * cannot handle allocation failures here */
+ __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
+}
+
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ if (!memcg_aware)
+ lru->node[i].memcg_lrus = NULL;
+ else if (memcg_init_list_lru_node(&lru->node[i]))
+ goto fail;
+ }
+ return 0;
+fail:
+ for (i = i - 1; i >= 0; i--)
+ memcg_destroy_list_lru_node(&lru->node[i]);
+ return -ENOMEM;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for (i = 0; i < nr_node_ids; i++)
+ memcg_destroy_list_lru_node(&lru->node[i]);
+}
+
+static int memcg_update_list_lru(struct list_lru *lru,
+ int old_size, int new_size)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return 0;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ if (memcg_update_list_lru_node(&lru->node[i],
+ old_size, new_size))
+ goto fail;
+ }
+ return 0;
+fail:
+ for (i = i - 1; i >= 0; i--)
+ memcg_cancel_update_list_lru_node(&lru->node[i],
+ old_size, new_size);
+ return -ENOMEM;
+}
+
+static void memcg_cancel_update_list_lru(struct list_lru *lru,
+ int old_size, int new_size)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for (i = 0; i < nr_node_ids; i++)
+ memcg_cancel_update_list_lru_node(&lru->node[i],
+ old_size, new_size);
+}
+
+int memcg_update_all_list_lrus(int new_size)
+{
+ int ret = 0;
+ struct list_lru *lru;
+ int old_size = memcg_nr_cache_ids;
+
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &list_lrus, list) {
+ ret = memcg_update_list_lru(lru, old_size, new_size);
+ if (ret)
+ goto fail;
+ }
+out:
+ mutex_unlock(&list_lrus_mutex);
+ return ret;
+fail:
+ list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+ memcg_cancel_update_list_lru(lru, old_size, new_size);
+ goto out;
+}
+
+static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
+ int src_idx, int dst_idx)
+{
+ struct list_lru_one *src, *dst;
+
+ /*
+ * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+ * we have to use IRQ-safe primitives here to avoid deadlock.
+ */
+ spin_lock_irq(&nlru->lock);
+
+ src = list_lru_from_memcg_idx(nlru, src_idx);
+ dst = list_lru_from_memcg_idx(nlru, dst_idx);
+
+ list_splice_init(&src->list, &dst->list);
+ dst->nr_items += src->nr_items;
+ src->nr_items = 0;
+
+ spin_unlock_irq(&nlru->lock);
+}
+
+static void memcg_drain_list_lru(struct list_lru *lru,
+ int src_idx, int dst_idx)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for (i = 0; i < nr_node_ids; i++)
+ memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
+}
+
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+{
+ struct list_lru *lru;
+
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &list_lrus, list)
+ memcg_drain_list_lru(lru, src_idx, dst_idx);
+ mutex_unlock(&list_lrus_mutex);
+}
+#else
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+ return 0;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+ struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
+ int err = -ENOMEM;
+
+ memcg_get_cache_ids();
lru->node = kzalloc(size, GFP_KERNEL);
if (!lru->node)
- return -ENOMEM;
+ goto out;
- nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
- INIT_LIST_HEAD(&lru->node[i].list);
- lru->node[i].nr_items = 0;
+ init_one_lru(&lru->node[i].lru);
}
- return 0;
+
+ err = memcg_init_list_lru(lru, memcg_aware);
+ if (err) {
+ kfree(lru->node);
+ goto out;
+ }
+
+ list_lru_register(lru);
+out:
+ memcg_put_cache_ids();
+ return err;
}
-EXPORT_SYMBOL_GPL(list_lru_init_key);
+EXPORT_SYMBOL_GPL(__list_lru_init);
void list_lru_destroy(struct list_lru *lru)
{
+ /* Already destroyed or not yet initialized? */
+ if (!lru->node)
+ return;
+
+ memcg_get_cache_ids();
+
+ list_lru_unregister(lru);
+
+ memcg_destroy_list_lru(lru);
kfree(lru->node);
+ lru->node = NULL;
+
+ memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 095c1f96fbec..d18d3a6e7337 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -332,8 +332,10 @@ struct mem_cgroup {
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
- /* Index in the kmem_cache->memcg_params->memcg_caches array */
+ /* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
+ bool kmem_acct_activated;
+ bool kmem_acct_active;
#endif
int last_scanned_node;
@@ -352,9 +354,9 @@ struct mem_cgroup {
};
#ifdef CONFIG_MEMCG_KMEM
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
- return memcg->kmemcg_id >= 0;
+ return memcg->kmem_acct_active;
}
#endif
@@ -517,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(tcp_proto_cgroup);
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
- if (!memcg_proto_activated(&memcg->tcp_mem))
- return;
- static_key_slow_dec(&memcg_socket_limit_enabled);
-}
-#else
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
-}
#endif
#ifdef CONFIG_MEMCG_KMEM
/*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
* memcgs, and none but the 200th is kmem-limited, we'd have to have a
* 200 entry array for that.
*
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size. It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
*/
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+ down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+ up_read(&memcg_cache_ids_sem);
+}
/*
* MIN_SIZE is different than 1, because we would like to avoid going through
@@ -569,32 +573,8 @@ int memcg_limited_groups_array_size;
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-static void memcg_free_cache_id(int id);
-
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
- if (memcg_kmem_is_active(memcg)) {
- static_key_slow_dec(&memcg_kmem_enabled_key);
- memcg_free_cache_id(memcg->kmemcg_id);
- }
- /*
- * This check can't live in kmem destruction function,
- * since the charges will outlive the cgroup
- */
- WARN_ON(page_counter_read(&memcg->kmem));
-}
-#else
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
-static void disarm_static_keys(struct mem_cgroup *memcg)
-{
- disarm_sock_keys(memcg);
- disarm_kmem_keys(memcg);
-}
-
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
@@ -2538,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
int id, size;
int err;
- id = ida_simple_get(&kmem_limited_groups,
+ id = ida_simple_get(&memcg_cache_ida,
0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
if (id < 0)
return id;
- if (id < memcg_limited_groups_array_size)
+ if (id < memcg_nr_cache_ids)
return id;
/*
* There's no space for the new id in memcg_caches arrays,
* so we have to grow them.
*/
+ down_write(&memcg_cache_ids_sem);
size = 2 * (id + 1);
if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2558,8 +2539,15 @@ static int memcg_alloc_cache_id(void)
size = MEMCG_CACHES_MAX_SIZE;
err = memcg_update_all_caches(size);
+ if (!err)
+ err = memcg_update_all_list_lrus(size);
+ if (!err)
+ memcg_nr_cache_ids = size;
+
+ up_write(&memcg_cache_ids_sem);
+
if (err) {
- ida_simple_remove(&kmem_limited_groups, id);
+ ida_simple_remove(&memcg_cache_ida, id);
return err;
}
return id;
@@ -2567,17 +2555,7 @@ static int memcg_alloc_cache_id(void)
static void memcg_free_cache_id(int id)
{
- ida_simple_remove(&kmem_limited_groups, id);
-}
-
-/*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
- */
-void memcg_update_array_size(int num)
-{
- memcg_limited_groups_array_size = num;
+ ida_simple_remove(&memcg_cache_ida, id);
}
struct memcg_kmem_cache_create_work {
@@ -2656,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ int kmemcg_id;
- VM_BUG_ON(!cachep->memcg_params);
- VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(cachep));
if (current->memcg_kmem_skip_account)
return cachep;
memcg = get_mem_cgroup_from_mm(current->mm);
- if (!memcg_kmem_is_active(memcg))
+ kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+ if (kmemcg_id < 0)
goto out;
- memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
if (likely(memcg_cachep))
return memcg_cachep;
@@ -2692,7 +2671,7 @@ out:
void __memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params->memcg->css);
+ css_put(&cachep->memcg_params.memcg->css);
}
/*
@@ -2757,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
memcg_uncharge_kmem(memcg, 1 << order);
page->mem_cgroup = NULL;
}
+
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct kmem_cache *cachep;
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+ if (PageSlab(page)) {
+ cachep = page->slab_cache;
+ if (!is_root_cache(cachep))
+ memcg = cachep->memcg_params.memcg;
+ } else
+ /* page allocated by alloc_kmem_pages */
+ memcg = page->mem_cgroup;
+
+ return memcg;
+}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3291,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
int err = 0;
int memcg_id;
- if (memcg_kmem_is_active(memcg))
- return 0;
+ BUG_ON(memcg->kmemcg_id >= 0);
+ BUG_ON(memcg->kmem_acct_activated);
+ BUG_ON(memcg->kmem_acct_active);
/*
* For simplicity, we won't allow this to be disabled. It also can't
@@ -3335,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
* patched.
*/
memcg->kmemcg_id = memcg_id;
+ memcg->kmem_acct_activated = true;
+ memcg->kmem_acct_active = true;
out:
return err;
}
@@ -4014,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return mem_cgroup_sockets_init(memcg, ss);
}
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+
+ if (!memcg->kmem_acct_active)
+ return;
+
+ /*
+ * Clear the 'active' flag before clearing memcg_caches arrays entries.
+ * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+ * guarantees no cache will be created for this cgroup after we are
+ * done (see memcg_create_kmem_cache()).
+ */
+ memcg->kmem_acct_active = false;
+
+ memcg_deactivate_kmem_caches(memcg);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ /*
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
+ */
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
- memcg_destroy_kmem_caches(memcg);
+ if (memcg->kmem_acct_activated) {
+ memcg_destroy_kmem_caches(memcg);
+ static_key_slow_dec(&memcg_kmem_enabled_key);
+ WARN_ON(page_counter_read(&memcg->kmem));
+ }
mem_cgroup_sockets_destroy(memcg);
}
#else
@@ -4025,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return 0;
}
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+}
+
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
@@ -4443,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
-
- disarm_static_keys(memcg);
kfree(memcg);
}
@@ -4581,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
spin_unlock(&memcg->event_list_lock);
vmpressure_cleanup(&memcg->vmpressure);
+
+ memcg_deactivate_kmem(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index feb803bf3443..d487f8dc6d39 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -242,15 +242,8 @@ void shake_page(struct page *p, int access)
* Only call shrink_node_slabs here (which would also shrink
* other caches) if access is not potentially fatal.
*/
- if (access) {
- int nr;
- int nid = page_to_nid(p);
- do {
- nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
- if (page_count(p) == 1)
- break;
- } while (nr > 10);
- }
+ if (access)
+ drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);
@@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags)
* setting PG_hwpoison.
*/
if (!is_free_buddy_page(page))
- lru_add_drain_all();
- if (!is_free_buddy_page(page))
drain_all_pages(page_zone(page));
SetPageHWPoison(page);
if (!is_free_buddy_page(page))
diff --git a/mm/memory.c b/mm/memory.c
index bbe6a73a899d..99275325f303 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3013,14 +3013,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
bool migrated = false;
int flags = 0;
+ /* A PROT_NONE fault should not end up here */
+ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
/*
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*
- * ptep_modify_prot_start is not called as this is clearing
- * the _PAGE_NUMA bit and it is not really expected that there
- * would be concurrent hardware modifications to the PTE.
+ * We can safely just do a "set_pte_at()", because the old
+ * page table entry is not accessible, so there would be no
+ * concurrent hardware modifications to the PTE.
*/
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
@@ -3029,7 +3032,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
}
- pte = pte_mknonnuma(pte);
+ /* Make it present again */
+ pte = pte_modify(pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);
@@ -3038,7 +3043,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(ptep, ptl);
return 0;
}
- BUG_ON(is_zero_pfn(page_to_pfn(page)));
/*
* Avoid grouping on DSO/COW pages in specific and RO pages
@@ -3124,7 +3128,7 @@ static int handle_pte_fault(struct mm_struct *mm,
pte, pmd, flags, entry);
}
- if (pte_numa(entry))
+ if (pte_protnone(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
ptl = pte_lockptr(mm, pmd);
@@ -3202,7 +3206,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (pmd_trans_splitting(orig_pmd))
return 0;
- if (pmd_numa(orig_pmd))
+ if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
@@ -3458,7 +3462,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
if (follow_phys(vma, addr, write, &prot, &phys_addr))
return -EINVAL;
- maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+ maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (write)
memcpy_toio(maddr + offset, buf, len);
else
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1bd23803576..c75f4dcec808 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -569,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
{
int nr_updated;
- nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
diff --git a/mm/migrate.c b/mm/migrate.c
index f98067e5d353..85e042686031 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd)
return PageLocked(page);
}
-void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
-{
- struct page *page = pmd_page(*pmd);
- wait_on_page_locked(page);
-}
-
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -1853,7 +1847,7 @@ out_fail:
out_dropref:
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
- entry = pmd_mknonnuma(entry);
+ entry = pmd_modify(entry, vma->vm_page_prot);
set_pmd_at(mm, mmun_start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4074caf9936b..5f420f7fafa1 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,14 +14,14 @@
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
-int mminit_loglevel;
+int __meminitdata mminit_loglevel;
#ifndef SECTIONS_SHIFT
#define SECTIONS_SHIFT 0
#endif
/* The zonelists are simply reported, validation is manual. */
-void mminit_verify_zonelist(void)
+void __init mminit_verify_zonelist(void)
{
int nid;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 33121662f08b..44727811bf4c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,36 +75,34 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
- bool updated = false;
- if (!prot_numa) {
- ptent = ptep_modify_prot_start(mm, addr, pte);
- if (pte_numa(ptent))
- ptent = pte_mknonnuma(ptent);
- ptent = pte_modify(ptent, newprot);
- /*
- * Avoid taking write faults for pages we
- * know to be dirty.
- */
- if (dirty_accountable && pte_dirty(ptent) &&
- (pte_soft_dirty(ptent) ||
- !(vma->vm_flags & VM_SOFTDIRTY)))
- ptent = pte_mkwrite(ptent);
- ptep_modify_prot_commit(mm, addr, pte, ptent);
- updated = true;
- } else {
+ /*
+ * Avoid trapping faults against the zero or KSM
+ * pages. See similar comment in change_huge_pmd.
+ */
+ if (prot_numa) {
struct page *page;
page = vm_normal_page(vma, addr, oldpte);
- if (page && !PageKsm(page)) {
- if (!pte_numa(oldpte)) {
- ptep_set_numa(mm, addr, pte);
- updated = true;
- }
- }
+ if (!page || PageKsm(page))
+ continue;
+
+ /* Avoid TLB flush if possible */
+ if (pte_protnone(oldpte))
+ continue;
}
- if (updated)
- pages++;
+
+ ptent = ptep_modify_prot_start(mm, addr, pte);
+ ptent = pte_modify(ptent, newprot);
+
+ /* Avoid taking write faults for known dirty pages */
+ if (dirty_accountable && pte_dirty(ptent) &&
+ (pte_soft_dirty(ptent) ||
+ !(vma->vm_flags & VM_SOFTDIRTY))) {
+ ptent = pte_mkwrite(ptent);
+ }
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
+ pages++;
} else if (IS_ENABLED(CONFIG_MIGRATION)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d52ab18fe0d..cb4758263f6b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -172,7 +172,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
* 1G machine -> (16M dma, 784M normal, 224M high)
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
- * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
*
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
@@ -3871,18 +3871,29 @@ static int __build_all_zonelists(void *data)
return 0;
}
+static noinline void __init
+build_all_zonelists_init(void)
+{
+ __build_all_zonelists(NULL);
+ mminit_verify_zonelist();
+ cpuset_init_current_mems_allowed();
+}
+
/*
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
+ *
+ * __ref due to (1) call of __meminit annotated setup_zone_pageset
+ * [we're only called with non-NULL zone through __meminit paths] and
+ * (2) call of __init annotated helper build_all_zonelists_init
+ * [protected by SYSTEM_BOOTING].
*/
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order();
if (system_state == SYSTEM_BOOTING) {
- __build_all_zonelists(NULL);
- mminit_verify_zonelist();
- cpuset_init_current_mems_allowed();
+ build_all_zonelists_init();
} else {
#ifdef CONFIG_MEMORY_HOTPLUG
if (zone)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index dfb79e028ecb..c25f94b33811 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
pmd_t entry = *pmdp;
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/mm/slab.c b/mm/slab.c
index 65b5dcb6f671..c4b89eaf4c96 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2382,7 +2382,7 @@ out:
return nr_freed;
}
-int __kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
{
int ret = 0;
int node;
@@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
int i;
struct kmem_cache_node *n;
- int rc = __kmem_cache_shrink(cachep);
+ int rc = __kmem_cache_shrink(cachep, false);
if (rc)
return rc;
@@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
int ret;
- struct kmem_cache *c = NULL;
- int i = 0;
+ struct kmem_cache *c;
ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
@@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
if ((ret < 0) || !is_root_cache(cachep))
return ret;
- VM_BUG_ON(!mutex_is_locked(&slab_mutex));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(cachep, i);
- if (c)
- /* return value determined by the parent cache only */
- __do_tune_cpucache(c, limit, batchcount, shared, gfp);
+ lockdep_assert_held(&slab_mutex);
+ for_each_memcg_cache(c, cachep) {
+ /* return value determined by the root cache only */
+ __do_tune_cpucache(c, limit, batchcount, shared, gfp);
}
return ret;
diff --git a/mm/slab.h b/mm/slab.h
index 90430d6f665e..4c3ac12dd644 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
extern void create_boot_cache(struct kmem_cache *, const char *name,
size_t size, unsigned long flags);
-struct mem_cgroup;
-
int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(size_t size, size_t align,
unsigned long flags, const char *name, void (*ctor)(void *));
@@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *, bool);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * Iterate over all memcg caches of the given root cache. The caller must hold
+ * slab_mutex.
+ */
+#define for_each_memcg_cache(iter, root) \
+ list_for_each_entry(iter, &(root)->memcg_params.list, \
+ memcg_params.list)
+
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+ list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
+ memcg_params.list)
+
static inline bool is_root_cache(struct kmem_cache *s)
{
- return !s->memcg_params || s->memcg_params->is_root_cache;
+ return s->memcg_params.is_root_cache;
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
+ struct kmem_cache *p)
{
- return (p == s) ||
- (s->memcg_params && (p == s->memcg_params->root_cache));
+ return p == s || p == s->memcg_params.root_cache;
}
/*
@@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
static inline const char *cache_name(struct kmem_cache *s)
{
if (!is_root_cache(s))
- return s->memcg_params->root_cache->name;
+ s = s->memcg_params.root_cache;
return s->name;
}
/*
* Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away. Since once
- * created a memcg's cache is destroyed only along with the root cache, it is
- * true if we are going to allocate from the cache or hold a reference to the
- * root cache by other means. Otherwise, we should hold either the slab_mutex
- * or the memcg's slab_caches_mutex while calling this function and accessing
- * the returned value.
+ * That said the caller must assure the memcg's cache won't go away by either
+ * taking a css reference to the owner cgroup, or holding the slab_mutex.
*/
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
struct kmem_cache *cachep;
- struct memcg_cache_params *params;
-
- if (!s->memcg_params)
- return NULL;
+ struct memcg_cache_array *arr;
rcu_read_lock();
- params = rcu_dereference(s->memcg_params);
+ arr = rcu_dereference(s->memcg_params.memcg_caches);
/*
* Make sure we will access the up-to-date value. The code updating
* memcg_caches issues a write barrier to match this (see
- * memcg_register_cache()).
+ * memcg_create_kmem_cache()).
*/
- cachep = lockless_dereference(params->memcg_caches[idx]);
+ cachep = lockless_dereference(arr->entries[idx]);
rcu_read_unlock();
return cachep;
@@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
if (is_root_cache(s))
return s;
- return s->memcg_params->root_cache;
+ return s->memcg_params.root_cache;
}
static __always_inline int memcg_charge_slab(struct kmem_cache *s,
@@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
return 0;
if (is_root_cache(s))
return 0;
- return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);
+ return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
}
static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
return;
if (is_root_cache(s))
return;
- memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order);
+ memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
}
-#else
+
+extern void slab_init_memcg_params(struct kmem_cache *);
+
+#else /* !CONFIG_MEMCG_KMEM */
+
+#define for_each_memcg_cache(iter, root) \
+ for ((void)(iter), (void)(root); 0; )
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+ for ((void)(iter), (void)(tmp), (void)(root); 0; )
+
static inline bool is_root_cache(struct kmem_cache *s)
{
return true;
@@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
{
}
-#endif
+
+static inline void slab_init_memcg_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6e1e4cf65836..1a1cc89acaa3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
#endif
#ifdef CONFIG_MEMCG_KMEM
-static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
- struct kmem_cache *s, struct kmem_cache *root_cache)
+void slab_init_memcg_params(struct kmem_cache *s)
{
- size_t size;
+ s->memcg_params.is_root_cache = true;
+ INIT_LIST_HEAD(&s->memcg_params.list);
+ RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+}
+
+static int init_memcg_params(struct kmem_cache *s,
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+ struct memcg_cache_array *arr;
- if (!memcg_kmem_enabled())
+ if (memcg) {
+ s->memcg_params.is_root_cache = false;
+ s->memcg_params.memcg = memcg;
+ s->memcg_params.root_cache = root_cache;
return 0;
+ }
- if (!memcg) {
- size = offsetof(struct memcg_cache_params, memcg_caches);
- size += memcg_limited_groups_array_size * sizeof(void *);
- } else
- size = sizeof(struct memcg_cache_params);
+ slab_init_memcg_params(s);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params)
- return -ENOMEM;
+ if (!memcg_nr_cache_ids)
+ return 0;
- if (memcg) {
- s->memcg_params->memcg = memcg;
- s->memcg_params->root_cache = root_cache;
- } else
- s->memcg_params->is_root_cache = true;
+ arr = kzalloc(sizeof(struct memcg_cache_array) +
+ memcg_nr_cache_ids * sizeof(void *),
+ GFP_KERNEL);
+ if (!arr)
+ return -ENOMEM;
+ RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
return 0;
}
-static void memcg_free_cache_params(struct kmem_cache *s)
+static void destroy_memcg_params(struct kmem_cache *s)
{
- kfree(s->memcg_params);
+ if (is_root_cache(s))
+ kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
}
-static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
+static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
- int size;
- struct memcg_cache_params *new_params, *cur_params;
-
- BUG_ON(!is_root_cache(s));
+ struct memcg_cache_array *old, *new;
- size = offsetof(struct memcg_cache_params, memcg_caches);
- size += num_memcgs * sizeof(void *);
+ if (!is_root_cache(s))
+ return 0;
- new_params = kzalloc(size, GFP_KERNEL);
- if (!new_params)
+ new = kzalloc(sizeof(struct memcg_cache_array) +
+ new_array_size * sizeof(void *), GFP_KERNEL);
+ if (!new)
return -ENOMEM;
- cur_params = s->memcg_params;
- memcpy(new_params->memcg_caches, cur_params->memcg_caches,
- memcg_limited_groups_array_size * sizeof(void *));
-
- new_params->is_root_cache = true;
-
- rcu_assign_pointer(s->memcg_params, new_params);
- if (cur_params)
- kfree_rcu(cur_params, rcu_head);
+ old = rcu_dereference_protected(s->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+ if (old)
+ memcpy(new->entries, old->entries,
+ memcg_nr_cache_ids * sizeof(void *));
+ rcu_assign_pointer(s->memcg_params.memcg_caches, new);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
@@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs)
{
struct kmem_cache *s;
int ret = 0;
- mutex_lock(&slab_mutex);
+ mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
- if (!is_root_cache(s))
- continue;
-
- ret = memcg_update_cache_params(s, num_memcgs);
+ ret = update_memcg_params(s, num_memcgs);
/*
* Instead of freeing the memory, we'll just leave the caches
* up to this point in an updated state.
*/
if (ret)
- goto out;
+ break;
}
-
- memcg_update_array_size(num_memcgs);
-out:
mutex_unlock(&slab_mutex);
return ret;
}
#else
-static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
- struct kmem_cache *s, struct kmem_cache *root_cache)
+static inline int init_memcg_params(struct kmem_cache *s,
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
return 0;
}
-static inline void memcg_free_cache_params(struct kmem_cache *s)
+static inline void destroy_memcg_params(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
s->align = align;
s->ctor = ctor;
- err = memcg_alloc_cache_params(memcg, s, root_cache);
+ err = init_memcg_params(s, memcg, root_cache);
if (err)
goto out_free_cache;
@@ -330,7 +329,7 @@ out:
return s;
out_free_cache:
- memcg_free_cache_params(s);
+ destroy_memcg_params(s);
kmem_cache_free(kmem_cache, s);
goto out;
}
@@ -369,6 +368,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
get_online_cpus();
get_online_mems();
+ memcg_get_cache_ids();
mutex_lock(&slab_mutex);
@@ -407,6 +407,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
out_unlock:
mutex_unlock(&slab_mutex);
+ memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
@@ -439,13 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s,
*need_rcu_barrier = true;
#ifdef CONFIG_MEMCG_KMEM
- if (!is_root_cache(s)) {
- struct kmem_cache *root_cache = s->memcg_params->root_cache;
- int memcg_id = memcg_cache_id(s->memcg_params->memcg);
-
- BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s);
- root_cache->memcg_params->memcg_caches[memcg_id] = NULL;
- }
+ if (!is_root_cache(s))
+ list_del(&s->memcg_params.list);
#endif
list_move(&s->list, release);
return 0;
@@ -482,9 +478,11 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *root_cache)
{
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- int memcg_id = memcg_cache_id(memcg);
+ struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+ struct memcg_cache_array *arr;
struct kmem_cache *s = NULL;
char *cache_name;
+ int idx;
get_online_cpus();
get_online_mems();
@@ -492,17 +490,27 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
mutex_lock(&slab_mutex);
/*
+ * The memory cgroup could have been deactivated while the cache
+ * creation work was pending.
+ */
+ if (!memcg_kmem_is_active(memcg))
+ goto out_unlock;
+
+ idx = memcg_cache_id(memcg);
+ arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+
+ /*
* Since per-memcg caches are created asynchronously on first
* allocation (see memcg_kmem_get_cache()), several threads can try to
* create the same cache, but only one of them may succeed.
*/
- if (cache_from_memcg_idx(root_cache, memcg_id))
+ if (arr->entries[idx])
goto out_unlock;
- cgroup_name(mem_cgroup_css(memcg)->cgroup,
- memcg_name_buf, sizeof(memcg_name_buf));
+ cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
- memcg_cache_id(memcg), memcg_name_buf);
+ css->id, memcg_name_buf);
if (!cache_name)
goto out_unlock;
@@ -520,13 +528,15 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
goto out_unlock;
}
+ list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
+
/*
* Since readers won't lock (see cache_from_memcg_idx()), we need a
* barrier here to ensure nobody will see the kmem_cache partially
* initialized.
*/
smp_wmb();
- root_cache->memcg_params->memcg_caches[memcg_id] = s;
+ arr->entries[idx] = s;
out_unlock:
mutex_unlock(&slab_mutex);
@@ -535,6 +545,37 @@ out_unlock:
put_online_cpus();
}
+void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
+{
+ int idx;
+ struct memcg_cache_array *arr;
+ struct kmem_cache *s, *c;
+
+ idx = memcg_cache_id(memcg);
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!is_root_cache(s))
+ continue;
+
+ arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+ c = arr->entries[idx];
+ if (!c)
+ continue;
+
+ __kmem_cache_shrink(c, true);
+ arr->entries[idx] = NULL;
+ }
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+}
+
void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
LIST_HEAD(release);
@@ -546,7 +587,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
mutex_lock(&slab_mutex);
list_for_each_entry_safe(s, s2, &slab_caches, list) {
- if (is_root_cache(s) || s->memcg_params->memcg != memcg)
+ if (is_root_cache(s) || s->memcg_params.memcg != memcg)
continue;
/*
* The cgroup is about to be freed and therefore has no charges
@@ -565,18 +606,20 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
void slab_kmem_cache_release(struct kmem_cache *s)
{
- memcg_free_cache_params(s);
+ destroy_memcg_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
}
void kmem_cache_destroy(struct kmem_cache *s)
{
- int i;
+ struct kmem_cache *c, *c2;
LIST_HEAD(release);
bool need_rcu_barrier = false;
bool busy = false;
+ BUG_ON(!is_root_cache(s));
+
get_online_cpus();
get_online_mems();
@@ -586,10 +629,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
- for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
-
- if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
+ for_each_memcg_cache_safe(c, c2, s) {
+ if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
busy = true;
}
@@ -619,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
- ret = __kmem_cache_shrink(cachep);
+ ret = __kmem_cache_shrink(cachep, false);
put_online_mems();
put_online_cpus();
return ret;
@@ -641,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
s->name = name;
s->size = s->object_size = size;
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+ slab_init_memcg_params(s);
+
err = __kmem_cache_create(s, flags);
if (err)
@@ -920,16 +964,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
{
struct kmem_cache *c;
struct slabinfo sinfo;
- int i;
if (!is_root_cache(s))
return;
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
-
+ for_each_memcg_cache(c, s) {
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(c, &sinfo);
@@ -981,7 +1020,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
if (p == slab_caches.next)
print_slabinfo_header(m);
- if (!is_root_cache(s) && s->memcg_params->memcg == memcg)
+ if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
cache_show(s, m);
return 0;
}
diff --git a/mm/slob.c b/mm/slob.c
index 96a86206a26b..94a7fede6d48 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
-int __kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
{
return 0;
}
diff --git a/mm/slub.c b/mm/slub.c
index 8b8508adf9c2..06cdb1829dc9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2007,6 +2007,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
int pages;
int pobjects;
+ preempt_disable();
do {
pages = 0;
pobjects = 0;
@@ -2040,6 +2041,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
+ if (unlikely(!s->cpu_partial)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ local_irq_restore(flags);
+ }
+ preempt_enable();
#endif
}
@@ -3358,69 +3367,92 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
+#define SHRINK_PROMOTE_MAX 32
+
/*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
*
* The slabs with the least items are placed last. This results in them
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
{
int node;
int i;
struct kmem_cache_node *n;
struct page *page;
struct page *t;
- int objects = oo_objects(s->max);
- struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ struct list_head discard;
+ struct list_head promote[SHRINK_PROMOTE_MAX];
unsigned long flags;
+ int ret = 0;
- if (!slabs_by_inuse)
- return -ENOMEM;
+ if (deactivate) {
+ /*
+ * Disable empty slabs caching. Used to avoid pinning offline
+ * memory cgroups by kmem pages that can be freed.
+ */
+ s->cpu_partial = 0;
+ s->min_partial = 0;
+
+ /*
+ * s->cpu_partial is checked locklessly (see put_cpu_partial),
+ * so we have to make sure the change is visible.
+ */
+ kick_all_cpus_sync();
+ }
flush_all(s);
for_each_kmem_cache_node(s, node, n) {
- if (!n->nr_partial)
- continue;
-
- for (i = 0; i < objects; i++)
- INIT_LIST_HEAD(slabs_by_inuse + i);
+ INIT_LIST_HEAD(&discard);
+ for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+ INIT_LIST_HEAD(promote + i);
spin_lock_irqsave(&n->list_lock, flags);
/*
- * Build lists indexed by the items in use in each slab.
+ * Build lists of slabs to discard or promote.
*
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- list_move(&page->lru, slabs_by_inuse + page->inuse);
- if (!page->inuse)
+ int free = page->objects - page->inuse;
+
+ /* Do not reread page->inuse */
+ barrier();
+
+ /* We do not keep full slabs on the list */
+ BUG_ON(free <= 0);
+
+ if (free == page->objects) {
+ list_move(&page->lru, &discard);
n->nr_partial--;
+ } else if (free <= SHRINK_PROMOTE_MAX)
+ list_move(&page->lru, promote + free - 1);
}
/*
- * Rebuild the partial list with the slabs filled up most
- * first and the least used slabs at the end.
+ * Promote the slabs filled up most to the head of the
+ * partial list.
*/
- for (i = objects - 1; i > 0; i--)
- list_splice(slabs_by_inuse + i, n->partial.prev);
+ for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+ list_splice(promote + i, &n->partial);
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ list_for_each_entry_safe(page, t, &discard, lru)
discard_slab(s, page);
+
+ if (slabs_node(s, node))
+ ret = 1;
}
- kfree(slabs_by_inuse);
- return 0;
+ return ret;
}
static int slab_mem_going_offline_callback(void *arg)
@@ -3429,7 +3461,7 @@ static int slab_mem_going_offline_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ __kmem_cache_shrink(s, false);
mutex_unlock(&slab_mutex);
return 0;
@@ -3577,6 +3609,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
p->slab_cache = s;
#endif
}
+ slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
return s;
}
@@ -3635,13 +3668,10 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s;
+ struct kmem_cache *s, *c;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
- int i;
- struct kmem_cache *c;
-
s->refcount++;
/*
@@ -3651,10 +3681,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
+ for_each_memcg_cache(c, s) {
c->object_size = s->object_size;
c->inuse = max_t(int, c->inuse,
ALIGN(size, sizeof(void *)));
@@ -4691,12 +4718,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- if (buf[0] == '1') {
- int rc = kmem_cache_shrink(s);
-
- if (rc)
- return rc;
- } else
+ if (buf[0] == '1')
+ kmem_cache_shrink(s);
+ else
return -EINVAL;
return length;
}
@@ -4920,7 +4944,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
err = attribute->store(s, buf, len);
#ifdef CONFIG_MEMCG_KMEM
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- int i;
+ struct kmem_cache *c;
mutex_lock(&slab_mutex);
if (s->max_attr_size < len)
@@ -4943,11 +4967,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
* directly either failed or succeeded, in which case we loop
* through the descendants with best-effort propagation.
*/
- for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
- if (c)
- attribute->store(c, buf, len);
- }
+ for_each_memcg_cache(c, s)
+ attribute->store(c, buf, len);
mutex_unlock(&slab_mutex);
}
#endif
@@ -4964,7 +4985,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
if (is_root_cache(s))
return;
- root_cache = s->memcg_params->root_cache;
+ root_cache = s->memcg_params.root_cache;
/*
* This mean this cache had no attribute written. Therefore, no point
@@ -5044,7 +5065,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
if (!is_root_cache(s))
- return s->memcg_params->root_cache->memcg_kset;
+ return s->memcg_params.root_cache->memcg_kset;
#endif
return slab_kset;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 224dd298fdcd..5e8eadd71bac 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -232,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
-static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
- struct shrinker *shrinker,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+ struct shrinker *shrinker,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
unsigned long freed = 0;
unsigned long long delta;
@@ -344,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
}
/**
- * shrink_node_slabs - shrink slab caches of a given node
+ * shrink_slab - shrink slab caches
* @gfp_mask: allocation context
* @nid: node whose slab caches to target
+ * @memcg: memory cgroup whose slab caches to target
* @nr_scanned: pressure numerator
* @nr_eligible: pressure denominator
*
@@ -355,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
* @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
* unaware shrinkers will receive a node id of 0 instead.
*
+ * @memcg specifies the memory cgroup to target. If it is not NULL,
+ * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
+ * objects from the memory cgroup specified. Otherwise all shrinkers
+ * are called, and memcg aware shrinkers are supposed to scan the
+ * global list then.
+ *
* @nr_scanned and @nr_eligible form a ratio that indicate how much of
* the available objects should be scanned. Page reclaim for example
* passes the number of pages scanned and the number of pages on the
@@ -365,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
*
* Returns the number of reclaimed slab objects.
*/
-unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
struct shrinker *shrinker;
unsigned long freed = 0;
+ if (memcg && !memcg_kmem_is_active(memcg))
+ return 0;
+
if (nr_scanned == 0)
nr_scanned = SWAP_CLUSTER_MAX;
@@ -390,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
+ .memcg = memcg,
};
+ if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+ continue;
+
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
sc.nid = 0;
- freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
+ freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
}
up_read(&shrinker_rwsem);
@@ -404,6 +419,29 @@ out:
return freed;
}
+void drop_slab_node(int nid)
+{
+ unsigned long freed;
+
+ do {
+ struct mem_cgroup *memcg = NULL;
+
+ freed = 0;
+ do {
+ freed += shrink_slab(GFP_KERNEL, nid, memcg,
+ 1000, 1000);
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+ } while (freed > 10);
+}
+
+void drop_slab(void)
+{
+ int nid;
+
+ for_each_online_node(nid)
+ drop_slab_node(nid);
+}
+
static inline int is_page_cache_freeable(struct page *page)
{
/*
@@ -2276,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
static bool shrink_zone(struct zone *zone, struct scan_control *sc,
bool is_classzone)
{
+ struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
@@ -2294,6 +2333,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
+ unsigned long scanned;
struct lruvec *lruvec;
int swappiness;
@@ -2305,10 +2345,16 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
+ scanned = sc->nr_scanned;
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
zone_lru_pages += lru_pages;
+ if (memcg && is_classzone)
+ shrink_slab(sc->gfp_mask, zone_to_nid(zone),
+ memcg, sc->nr_scanned - scanned,
+ lru_pages);
+
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
@@ -2330,19 +2376,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
* Shrink the slab caches in the same proportion that
* the eligible LRU pages were scanned.
*/
- if (global_reclaim(sc) && is_classzone) {
- struct reclaim_state *reclaim_state;
-
- shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
- sc->nr_scanned - nr_scanned,
- zone_lru_pages);
-
- reclaim_state = current->reclaim_state;
- if (reclaim_state) {
- sc->nr_reclaimed +=
- reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
+ if (global_reclaim(sc) && is_classzone)
+ shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
+ sc->nr_scanned - nr_scanned,
+ zone_lru_pages);
+
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
}
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
diff --git a/mm/workingset.c b/mm/workingset.c
index f7216fa7da27..aa017133744b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+ shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
pages = node_present_pages(sc->nid);
@@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
}
static enum lru_status shadow_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
{
@@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
- list_del_init(item);
+ list_lru_isolate(lru, item);
spin_unlock(lru_lock);
/*
@@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
- shadow_lru_isolate, NULL, &sc->nr_to_scan);
+ ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
+ shadow_lru_isolate, NULL);
local_irq_enable();
return ret;
}
diff --git a/mm/zbud.c b/mm/zbud.c
index 4e387bea702e..2ee4e4520493 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
-static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zbud_zpool_create(char *name, gfp_t gfp,
+ struct zpool_ops *zpool_ops)
{
return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
}
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0d183a..bacdab6e47de 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
/**
* zpool_create_pool() - Create a new zpool
* @type The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @name The name of the zpool (e.g. zram0, zswap)
* @gfp The GFP flags to use when allocating the pool.
* @ops The optional ops callback.
*
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
*
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
+struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+ struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
zpool->type = driver->type;
zpool->driver = driver;
- zpool->pool = driver->create(gfp, ops);
+ zpool->pool = driver->create(name, gfp, ops);
zpool->ops = ops;
if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b72403927aa4..0dec1fa5f656 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -91,6 +91,7 @@
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
+#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
@@ -168,6 +169,22 @@ enum fullness_group {
ZS_FULL
};
+enum zs_stat_type {
+ OBJ_ALLOCATED,
+ OBJ_USED,
+ NR_ZS_STAT_TYPE,
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static struct dentry *zs_stat_root;
+
+struct zs_size_stat {
+ unsigned long objs[NR_ZS_STAT_TYPE];
+};
+
+#endif
+
/*
* number of size_classes
*/
@@ -200,6 +217,10 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct zs_size_stat stats;
+#endif
+
spinlock_t lock;
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
@@ -217,10 +238,16 @@ struct link_free {
};
struct zs_pool {
+ char *name;
+
struct size_class **size_class;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct dentry *stat_dentry;
+#endif
};
/*
@@ -246,9 +273,9 @@ struct mapping_area {
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
{
- return zs_create_pool(gfp);
+ return zs_create_pool(name, gfp);
}
static void zs_zpool_destroy(void *pool)
@@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+ if (!zs_stat_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+ seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
+ "obj_allocated", "obj_used", "pages_used");
+
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
+ class->size, obj_allocated, obj_used, pages_used);
+
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
+ total_objs, total_used_objs, total_pages);
+
+ return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+ .open = zs_stats_size_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ struct dentry *entry;
+
+ if (!zs_stat_root)
+ return -ENODEV;
+
+ entry = debugfs_create_dir(name, zs_stat_root);
+ if (!entry) {
+ pr_warn("debugfs dir <%s> creation failed\n", name);
+ return -ENOMEM;
+ }
+ pool->stat_dentry = entry;
+
+ entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
+ pool->stat_dentry, pool, &zs_stat_size_ops);
+ if (!entry) {
+ pr_warn("%s: debugfs file entry <%s> creation failed\n",
+ name, "obj_in_classes");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
return atomic_long_read(&pool->pages_allocated);
@@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
atomic_long_add(class->pages_per_zspage,
&pool->pages_allocated);
+
spin_lock(&class->lock);
+ zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
}
obj = (unsigned long)first_page->freelist;
@@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
kunmap_atomic(vaddr);
first_page->inuse++;
+ zs_stat_inc(class, OBJ_USED, 1);
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(pool, first_page);
spin_unlock(&class->lock);
@@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
first_page->inuse--;
fullness = fix_fullness_group(pool, first_page);
+
+ zs_stat_dec(class, OBJ_USED, 1);
+ if (fullness == ZS_EMPTY)
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+
spin_unlock(&class->lock);
if (fullness == ZS_EMPTY) {
@@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free);
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
-struct zs_pool *zs_create_pool(gfp_t flags)
+struct zs_pool *zs_create_pool(char *name, gfp_t flags)
{
int i;
struct zs_pool *pool;
@@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags)
if (!pool)
return NULL;
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name) {
+ kfree(pool);
+ return NULL;
+ }
+
pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
GFP_KERNEL);
if (!pool->size_class) {
+ kfree(pool->name);
kfree(pool);
return NULL;
}
@@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags)
pool->flags = flags;
+ if (zs_pool_stat_create(name, pool))
+ goto err;
+
return pool;
err:
@@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool)
{
int i;
+ zs_pool_stat_destroy(pool);
+
for (i = 0; i < zs_size_classes; i++) {
int fg;
struct size_class *class = pool->size_class[i];
@@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool)
}
kfree(pool->size_class);
+ kfree(pool->name);
kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -1250,17 +1460,30 @@ static int __init zs_init(void)
{
int ret = zs_register_cpu_notifier();
- if (ret) {
- zs_unregister_cpu_notifier();
- return ret;
- }
+ if (ret)
+ goto notifier_fail;
init_zs_size_classes();
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif
+
+ ret = zs_stat_init();
+ if (ret) {
+ pr_err("zs stat initialization failed\n");
+ goto stat_fail;
+ }
return 0;
+
+stat_fail:
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+notifier_fail:
+ zs_unregister_cpu_notifier();
+
+ return ret;
}
static void __exit zs_exit(void)
@@ -1269,6 +1492,8 @@ static void __exit zs_exit(void)
zpool_unregister_driver(&zs_zpool_driver);
#endif
zs_unregister_cpu_notifier();
+
+ zs_stat_exit();
}
module_init(zs_init);
diff --git a/mm/zswap.c b/mm/zswap.c
index 0cfce9bc51e4..4249e82ff934 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -906,11 +906,12 @@ static int __init init_zswap(void)
pr_info("loading zswap\n");
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
+ &zswap_zpool_ops);
if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
pr_info("%s zpool not available\n", zswap_zpool_type);
zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
&zswap_zpool_ops);
}
if (!zswap_pool) {
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index c2a75c6957a1..2379c1b4efb2 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -47,6 +47,10 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
return;
percpu_counter_destroy(&cg_proto->sockets_allocated);
+
+ if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_dec(&memcg_socket_limit_enabled);
+
}
EXPORT_SYMBOL(tcp_destroy_cgroup);