summary refs log tree commit diff stats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r-- kernel/sched/fair.c 33
1 files changed, 31 insertions, 2 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 06db566c7660..1d1dd88daaab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1097,6 +1097,9 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
struct numa_group {
refcount_t refcount;
@@ -1501,6 +1504,29 @@ static int numa_hint_fault_latency(struct page *page)
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that we attempt to promote.
+ *
+ * Account @nr candidates on @pgdat and check them against @rate_limit:
+ *
+ *  - @nr is added to the node's PGPROMOTE_CANDIDATE counter and the
+ *    updated counter value is read back.
+ *  - When more than MSEC_PER_SEC milliseconds have passed since
+ *    pgdat->nbp_rl_start, the window start is moved to the current time
+ *    with cmpxchg(); only the caller whose exchange succeeds stores the
+ *    current counter value in pgdat->nbp_rl_nr_cand as the new baseline.
+ *  - The number of candidates counted since that baseline (this call's
+ *    @nr included) is compared with @rate_limit.
+ *
+ * @pgdat: node whose candidate counter and window state are updated
+ * @rate_limit: candidate budget per window, in the same unit as @nr
+ *              (presumably pages; confirm against the caller)
+ * @nr: number of candidates to account for this call
+ *
+ * Return: true if the candidates counted since the baseline have reached
+ * @rate_limit, false otherwise.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1515,7 +1541,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!node_is_toptier(src_nid)) {
struct pglist_data *pgdat;
- unsigned long latency, th;
+ unsigned long rate_limit, latency, th;
pgdat = NODE_DATA(dst_nid);
if (pgdat_free_space_enough(pgdat))
@@ -1526,7 +1552,10 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
if (latency >= th)
return false;
- return true;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ thp_nr_pages(page));
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);