From e5671dfae59b165e2adfd4dfbdeab11ac8db5bda Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:01 +0000 Subject: Basic kernel memory functionality for the Memory Controller This patch lays down the foundation for the kernel memory component of the Memory Controller. As of today, I am only laying down the following files: * memory.independent_kmem_limit * memory.kmem.limit_in_bytes (currently ignored) * memory.kmem.usage_in_bytes (always zero) Signed-off-by: Glauber Costa CC: Kirill A. Shutemov CC: Paul Menage CC: Greg Thelen CC: Johannes Weiner CC: Michal Hocko Signed-off-by: David S. Miller --- mm/memcontrol.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6aff93c98aca..9fbcff71245e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -226,6 +226,10 @@ struct mem_cgroup { * the counter to account for mem+swap usage. */ struct res_counter memsw; + /* + * the counter to account for kmem usage. + */ + struct res_counter kmem; /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -276,6 +280,11 @@ struct mem_cgroup { * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; + /* + * Should kernel memory limits be stabilished independently + * from user memory ? + */ + int kmem_independent_accounting; /* * percpu counter. */ @@ -344,9 +353,14 @@ enum charge_type { }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) + +enum mem_type { + _MEM = 0, + _MEMSWAP, + _OOM_TYPE, + _KMEM, +}; + #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -3848,10 +3862,17 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) u64 val; if (!mem_cgroup_is_root(memcg)) { + val = 0; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM + if (!memcg->kmem_independent_accounting) + val = res_counter_read_u64(&memcg->kmem, RES_USAGE); +#endif if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); + val += res_counter_read_u64(&memcg->res, RES_USAGE); else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); + val += res_counter_read_u64(&memcg->memsw, RES_USAGE); + + return val; } val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); @@ -3884,6 +3905,11 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) else val = res_counter_read_u64(&memcg->memsw, name); break; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; +#endif default: BUG(); break; @@ -4612,6 +4638,69 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +static u64 kmem_limit_independent_read(struct cgroup *cgroup, struct cftype *cft) +{ + return mem_cgroup_from_cont(cgroup)->kmem_independent_accounting; +} + +static int kmem_limit_independent_write(struct cgroup *cgroup, struct cftype *cft, + u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + + val = !!val; + + /* + * This follows the same hierarchy restrictions than + * mem_cgroup_hierarchy_write() + */ + if (!parent || !parent->use_hierarchy) { + if (list_empty(&cgroup->children)) + memcg->kmem_independent_accounting = val; + else + return -EBUSY; + } + else + return -EINVAL; + + return 0; +} +static struct cftype kmem_cgroup_files[] = { + { + .name = "independent_kmem_limit", + .read_u64 = kmem_limit_independent_read, + .write_u64 = kmem_limit_independent_write, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read_u64 = mem_cgroup_read, + }, + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .read_u64 = mem_cgroup_read, + }, +}; + +static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + int ret = 0; + + ret = cgroup_add_files(cont, ss, kmem_cgroup_files, + ARRAY_SIZE(kmem_cgroup_files)); + return ret; +}; + +#else +static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +{ + return 0; +} +#endif + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4925,6 +5014,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. @@ -4935,6 +5025,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); } memcg->last_scanned_child = 0; memcg->last_scanned_node = MAX_NUMNODES; @@ -4978,6 +5069,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, if (!ret) ret = register_memsw_files(cont, ss); + + if (!ret) + ret = register_kmem_files(cont, ss); + return ret; } -- cgit v1.2.3 From e1aab161e0135aafcd439be20b4f35e4b0922d95 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:03 +0000 Subject: socket: initial cgroup code. The goal of this work is to move the memory pressure tcp controls to a cgroup, instead of just relying on global conditions. To avoid excessive overhead in the network fast paths, the code that accounts allocated memory to a cgroup is hidden inside a static_branch(). This branch is patched out until the first non-root cgroup is created. So when nobody is using cgroups, even if it is mounted, no significant performance penalty should be seen. This patch handles the generic part of the code, and has nothing tcp-specific. Signed-off-by: Glauber Costa Reviewed-by: KAMEZAWA Hiroyuki CC: Kirill A. Shutemov CC: David S. Miller CC: Eric W. Biederman CC: Eric Dumazet Signed-off-by: David S. Miller --- mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9fbcff71245e..3de3901ae0a7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -379,7 +379,48 @@ enum mem_type { static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); + +/* Writing them here to avoid exposing memcg's inner layout */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_INET +#include + +static bool mem_cgroup_is_root(struct mem_cgroup *memcg); +void sock_update_memcg(struct sock *sk) +{ + /* A socket spends its whole life in the same cgroup */ + if (sk->sk_cgrp) { + WARN_ON(1); + return; + } + if (static_branch(&memcg_socket_limit_enabled)) { + struct mem_cgroup *memcg; + + BUG_ON(!sk->sk_prot->proto_cgroup); + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (!mem_cgroup_is_root(memcg)) { + mem_cgroup_get(memcg); + sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); + } + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) { + struct mem_cgroup *memcg; + WARN_ON(!sk->sk_cgrp->memcg); + memcg = sk->sk_cgrp->memcg; + mem_cgroup_put(memcg); + } +} +#endif /* CONFIG_INET */ +#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -4932,12 +4973,13 @@ static void mem_cgroup_put(struct mem_cgroup *memcg) /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { if (!memcg->res.parent) return NULL; return mem_cgroup_from_res_counter(memcg->res.parent, res); } +EXPORT_SYMBOL(parent_mem_cgroup); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) -- cgit v1.2.3 From d1a4c0b37c296e600ffe08edb0db2dc1b8f550d7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:04 +0000 Subject: tcp memory pressure controls This patch introduces memory pressure controls for the tcp protocol. It uses the generic socket memory pressure code introduced in earlier patches, and fills in the necessary data in cg_proto struct. Signed-off-by: Glauber Costa Reviewed-by: KAMEZAWA Hiroyuki CC: Eric W. Biederman Signed-off-by: David S. Miller --- mm/memcontrol.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3de3901ae0a7..7266202fa7cf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -50,6 +50,8 @@ #include #include #include "internal.h" +#include +#include #include @@ -295,6 +297,10 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; + +#ifdef CONFIG_INET + struct tcp_memcontrol tcp_mem; +#endif }; /* Stuffs for move charges at task migration. */ @@ -384,6 +390,7 @@ static void mem_cgroup_put(struct mem_cgroup *memcg); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM #ifdef CONFIG_INET #include +#include static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) @@ -418,6 +425,15 @@ void sock_release_memcg(struct sock *sk) mem_cgroup_put(memcg); } } + +struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) +{ + if (!memcg || mem_cgroup_is_root(memcg)) + return NULL; + + return &memcg->tcp_mem.cg_proto; +} +EXPORT_SYMBOL(tcp_proto_cgroup); #endif /* CONFIG_INET */ #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ @@ -800,7 +816,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return container_of(cgroup_subsys_state(cont, mem_cgroup_subsys_id), struct mem_cgroup, @@ -4732,14 +4748,34 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) ret = cgroup_add_files(cont, ss, kmem_cgroup_files, ARRAY_SIZE(kmem_cgroup_files)); + + /* + * Part of this would be better living in a separate allocation + * function, leaving us with just the cgroup tree population work. + * We, however, depend on state such as network's proto_list that + * is only initialized after cgroup creation. I found the less + * cumbersome way to deal with it to defer it all to populate time + */ + if (!ret) + ret = mem_cgroup_sockets_init(cont, ss); return ret; }; +static void kmem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + mem_cgroup_sockets_destroy(cont, ss); +} #else static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) { return 0; } + +static void kmem_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ +} #endif static struct cftype mem_cgroup_files[] = { @@ -5098,6 +5134,8 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_destroy(ss, cont); + mem_cgroup_put(memcg); } -- cgit v1.2.3 From 65c64ce8ee642eb330a4c4d94b664725f2902b44 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 22 Dec 2011 01:02:27 +0000 Subject: Partial revert "Basic kernel memory functionality for the Memory Controller" This reverts commit e5671dfae59b165e2adfd4dfbdeab11ac8db5bda. After a follow up discussion with Michal, it was agreed it would be better to leave the kmem controller with just the tcp files, deferring the behavior of the other general memory.kmem.* files for a later time, when more caches are controlled. This is because generic kmem files are not used by tcp accounting and it is not clear how other slab caches would fit into the scheme. We are reverting the original commit so we can track the reference. Part of the patch is kept, because it was used by the later tcp code. Conflicts are shown in the bottom. init/Kconfig is removed from the revert entirely. Signed-off-by: Glauber Costa Acked-by: Michal Hocko CC: Kirill A. Shutemov CC: Paul Menage CC: Greg Thelen CC: Johannes Weiner CC: David S. Miller Conflicts: Documentation/cgroups/memory.txt mm/memcontrol.c Signed-off-by: David S. Miller --- mm/memcontrol.c | 93 ++++----------------------------------------------------- 1 file changed, 6 insertions(+), 87 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7266202fa7cf..8cdc9156455c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -228,10 +228,6 @@ struct mem_cgroup { * the counter to account for mem+swap usage. */ struct res_counter memsw; - /* - * the counter to account for kmem usage. - */ - struct res_counter kmem; /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -282,11 +278,6 @@ struct mem_cgroup { * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; - /* - * Should kernel memory limits be stabilished independently - * from user memory ? - */ - int kmem_independent_accounting; /* * percpu counter. */ @@ -359,14 +350,9 @@ enum charge_type { }; /* for encoding cft->private value on file */ - -enum mem_type { - _MEM = 0, - _MEMSWAP, - _OOM_TYPE, - _KMEM, -}; - +#define _MEM (0) +#define _MEMSWAP (1) +#define _OOM_TYPE (2) #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -3919,17 +3905,10 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) u64 val; if (!mem_cgroup_is_root(memcg)) { - val = 0; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM - if (!memcg->kmem_independent_accounting) - val = res_counter_read_u64(&memcg->kmem, RES_USAGE); -#endif if (!swap) - val += res_counter_read_u64(&memcg->res, RES_USAGE); + return res_counter_read_u64(&memcg->res, RES_USAGE); else - val += res_counter_read_u64(&memcg->memsw, RES_USAGE); - - return val; + return res_counter_read_u64(&memcg->memsw, RES_USAGE); } val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); @@ -3962,11 +3941,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) else val = res_counter_read_u64(&memcg->memsw, name); break; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM - case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); - break; -#endif default: BUG(); break; @@ -4696,59 +4670,8 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) #endif /* CONFIG_NUMA */ #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM -static u64 kmem_limit_independent_read(struct cgroup *cgroup, struct cftype *cft) -{ - return mem_cgroup_from_cont(cgroup)->kmem_independent_accounting; -} - -static int kmem_limit_independent_write(struct cgroup *cgroup, struct cftype *cft, - u64 val) -{ - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - - val = !!val; - - /* - * This follows the same hierarchy restrictions than - * mem_cgroup_hierarchy_write() - */ - if (!parent || !parent->use_hierarchy) { - if (list_empty(&cgroup->children)) - memcg->kmem_independent_accounting = val; - else - return -EBUSY; - } - else - return -EINVAL; - - return 0; -} -static struct cftype kmem_cgroup_files[] = { - { - .name = "independent_kmem_limit", - .read_u64 = kmem_limit_independent_read, - .write_u64 = kmem_limit_independent_write, - }, - { - .name = "kmem.usage_in_bytes", - .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), - .read_u64 = mem_cgroup_read, - }, - { - .name = "kmem.limit_in_bytes", - .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), - .read_u64 = mem_cgroup_read, - }, -}; - static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) { - int ret = 0; - - ret = cgroup_add_files(cont, ss, kmem_cgroup_files, - ARRAY_SIZE(kmem_cgroup_files)); - /* * Part of this would be better living in a separate allocation * function, leaving us with just the cgroup tree population work. @@ -4756,9 +4679,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) * is only initialized after cgroup creation. I found the less * cumbersome way to deal with it to defer it all to populate time */ - if (!ret) - ret = mem_cgroup_sockets_init(cont, ss); - return ret; + return mem_cgroup_sockets_init(cont, ss); }; static void kmem_cgroup_destroy(struct cgroup_subsys *ss, @@ -5092,7 +5013,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); - res_counter_init(&memcg->kmem, &parent->kmem); /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. @@ -5103,7 +5023,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); } memcg->last_scanned_child = 0; memcg->last_scanned_node = MAX_NUMNODES; -- cgit v1.2.3