diff options
Diffstat (limited to 'mm/page_counter.c')
-rw-r--r-- | mm/page_counter.c | 173 |
1 files changed, 173 insertions, 0 deletions
diff --git a/mm/page_counter.c b/mm/page_counter.c index db20d6452b71..0153f5bb3161 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -262,3 +262,176 @@ int page_counter_memparse(const char *buf, const char *max, return 0; } + + +/* + * This function calculates an individual page counter's effective + * protection which is derived from its own memory.min/low, its + * parent's and siblings' settings, as well as the actual memory + * distribution in the tree. + * + * The following rules apply to the effective protection values: + * + * 1. At the first level of reclaim, effective protection is equal to + * the declared protection in memory.min and memory.low. + * + * 2. To enable safe delegation of the protection configuration, at + * subsequent levels the effective protection is capped to the + * parent's effective protection. + * + * 3. To make complex and dynamic subtrees easier to configure, the + * user is allowed to overcommit the declared protection at a given + * level. If that is the case, the parent's effective protection is + * distributed to the children in proportion to how much protection + * they have declared and how much of it they are utilizing. + * + * This makes distribution proportional, but also work-conserving: + * if one counter claims much more protection than it uses memory, + * the unused remainder is available to its siblings. + * + * 4. Conversely, when the declared protection is undercommitted at a + * given level, the distribution of the larger parental protection + * budget is NOT proportional. A counter's protection from a sibling + * is capped to its own memory.min/low setting. + * + * 5. However, to allow protecting recursive subtrees from each other + * without having to declare each individual counter's fixed share + * of the ancestor's claim to protection, any unutilized - + * "floating" - protection from up the tree is distributed in + * proportion to each counter's *usage*. This makes the protection + * neutral wrt sibling cgroups and lets them compete freely over + * the shared parental protection budget, but it protects the + * subtree as a whole from neighboring subtrees. + * + * Note that 4. and 5. are not in conflict: 4. is about protecting + * against immediate siblings whereas 5. is about protecting against + * neighboring subtrees. + */ +static unsigned long effective_protection(unsigned long usage, + unsigned long parent_usage, + unsigned long setting, + unsigned long parent_effective, + unsigned long siblings_protected, + bool recursive_protection) +{ + unsigned long protected; + unsigned long ep; + + protected = min(usage, setting); + /* + * If all cgroups at this level combined claim and use more + * protection than what the parent affords them, distribute + * shares in proportion to utilization. + * + * We are using actual utilization rather than the statically + * claimed protection in order to be work-conserving: claimed + * but unused protection is available to siblings that would + * otherwise get a smaller chunk than what they claimed. + */ + if (siblings_protected > parent_effective) + return protected * parent_effective / siblings_protected; + + /* + * Ok, utilized protection of all children is within what the + * parent affords them, so we know whatever this child claims + * and utilizes is effectively protected. + * + * If there is unprotected usage beyond this value, reclaim + * will apply pressure in proportion to that amount. + * + * If there is unutilized protection, the cgroup will be fully + * shielded from reclaim, but we do return a smaller value for + * protection than what the group could enjoy in theory. This + * is okay. With the overcommit distribution above, effective + * protection is always dependent on how memory is actually + * consumed among the siblings anyway. + */ + ep = protected; + + /* + * If the children aren't claiming (all of) the protection + * afforded to them by the parent, distribute the remainder in + * proportion to the (unprotected) memory of each cgroup. That + * way, cgroups that aren't explicitly prioritized wrt each + * other compete freely over the allowance, but they are + * collectively protected from neighboring trees. + * + * We're using unprotected memory for the weight so that if + * some cgroups DO claim explicit protection, we don't protect + * the same bytes twice. + * + * Check both usage and parent_usage against the respective + * protected values. One should imply the other, but they + * aren't read atomically - make sure the division is sane. + */ + if (!recursive_protection) + return ep; + + if (parent_effective > siblings_protected && + parent_usage > siblings_protected && + usage > protected) { + unsigned long unclaimed; + + unclaimed = parent_effective - siblings_protected; + unclaimed *= usage - protected; + unclaimed /= parent_usage - siblings_protected; + + ep += unclaimed; + } + + return ep; +} + + +/** + * page_counter_calculate_protection - check if memory consumption is in the normal range + * @root: the top ancestor of the sub-tree being checked + * @counter: the page_counter the counter to update + * @recursive_protection: Whether to use memory_recursiveprot behavior. + * + * Calculates elow/emin thresholds for given page_counter. + * + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. + */ +void page_counter_calculate_protection(struct page_counter *root, + struct page_counter *counter, + bool recursive_protection) +{ + unsigned long usage, parent_usage; + struct page_counter *parent = counter->parent; + + /* + * Effective values of the reclaim targets are ignored so they + * can be stale. Have a look at mem_cgroup_protection for more + * details. + * TODO: calculation should be more robust so that we do not need + * that special casing. + */ + if (root == counter) + return; + + usage = page_counter_read(counter); + if (!usage) + return; + + if (parent == root) { + counter->emin = READ_ONCE(counter->min); + counter->elow = READ_ONCE(counter->low); + return; + } + + parent_usage = page_counter_read(parent); + + WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage, + READ_ONCE(counter->min), + READ_ONCE(parent->emin), + atomic_long_read(&parent->children_min_usage), + recursive_protection)); + + WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage, + READ_ONCE(counter->low), + READ_ONCE(parent->elow), + atomic_long_read(&parent->children_low_usage), + recursive_protection)); +} |