Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--  kernel/workqueue.c  341
1 file changed, 309 insertions(+), 32 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8d465478adb9..903be39bd2d1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -126,6 +126,9 @@ enum wq_internal_consts {
*
* L: pool->lock protected. Access with pool->lock held.
*
+ * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
+ * reads.
+ *
* K: Only modified by worker while holding pool->lock. Can be safely read by
* self, while holding pool->lock or from IRQ context if %current is the
* kworker.
@@ -247,17 +250,18 @@ struct pool_workqueue {
* pwq->inactive_works instead of pool->worklist and marked with
* WORK_STRUCT_INACTIVE.
*
- * All work items marked with WORK_STRUCT_INACTIVE do not participate
- * in pwq->nr_active and all work items in pwq->inactive_works are
- * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
- * work items are in pwq->inactive_works. Some of them are ready to
- * run in pool->worklist or worker->scheduled. Those work itmes are
- * only struct wq_barrier which is used for flush_work() and should
- * not participate in pwq->nr_active. For non-barrier work item, it
- * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
+ * All work items marked with WORK_STRUCT_INACTIVE do not participate in
+ * nr_active and all work items in pwq->inactive_works are marked with
+ * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
+ * in pwq->inactive_works. Some of them are ready to run in
+ * pool->worklist or worker->scheduled. Those work items are only struct
+ * wq_barrier which is used for flush_work() and should not participate
+ * in nr_active. For non-barrier work item, it is marked with
+ * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
*/
int nr_active; /* L: nr of active works */
struct list_head inactive_works; /* L: inactive works */
+ struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
@@ -289,9 +293,19 @@ struct wq_device;
* on each CPU, in an unbound workqueue, max_active applies to the whole system.
* As sharing a single nr_active across multiple sockets can be very expensive,
* the counting and enforcement is per NUMA node.
+ *
+ * The following struct is used to enforce per-node max_active. When a pwq wants
+ * to start executing a work item, it should increment ->nr using
+ * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
+ * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
+ * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
+ * round-robin order.
*/
struct wq_node_nr_active {
- atomic_t nr; /* per-node nr_active count */
+ int max; /* per-node max_active */
+ atomic_t nr; /* per-node nr_active */
+ raw_spinlock_t lock; /* nests inside pool locks */
+ struct list_head pending_pwqs; /* LN: pwqs with inactive works */
};
/*
@@ -314,8 +328,12 @@ struct workqueue_struct {
struct worker *rescuer; /* MD: rescue worker */
int nr_drainers; /* WQ: drain in progress */
+
+ /* See alloc_workqueue() function comment for info on min/max_active */
int max_active; /* WO: max active works */
+ int min_active; /* WO: min active works */
int saved_max_active; /* WQ: saved max_active */
+ int saved_min_active; /* WQ: saved min_active */
struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */
@@ -667,6 +685,19 @@ static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
lockdep_is_held(&wq->mutex));
}
+/**
+ * unbound_effective_cpumask - effective cpumask of an unbound workqueue
+ * @wq: workqueue of interest
+ *
+ * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
+ * is masked with wq_unbound_cpumask to determine the effective cpumask. The
+ * default pwq is always mapped to the pool with the current effective cpumask.
+ */
+static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
+{
+ return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
+}
+
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
@@ -1462,6 +1493,46 @@ static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
}
/**
+ * wq_update_node_max_active - Update per-node max_actives to use
+ * @wq: workqueue to update
+ * @off_cpu: CPU that's going down, -1 if a CPU is not going down
+ *
+ * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
+ * distributed among nodes according to the proportions of numbers of online
+ * cpus. The result is always between @wq->min_active and max_active.
+ */
+static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
+{
+ struct cpumask *effective = unbound_effective_cpumask(wq);
+ int min_active = READ_ONCE(wq->min_active);
+ int max_active = READ_ONCE(wq->max_active);
+ int total_cpus, node;
+
+ lockdep_assert_held(&wq->mutex);
+
+ if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
+ off_cpu = -1;
+
+ total_cpus = cpumask_weight_and(effective, cpu_online_mask);
+ if (off_cpu >= 0)
+ total_cpus--;
+
+ for_each_node(node) {
+ int node_cpus;
+
+ node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
+ if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
+ node_cpus--;
+
+ wq_node_nr_active(wq, node)->max =
+ clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
+ min_active, max_active);
+ }
+
+ wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
+}
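/*
 * Editor's illustration, not part of the patch: assuming max_active = 16,
 * min_active = 8 and an effective cpumask spanning two NUMA nodes with 12
 * and 4 online CPUs, total_cpus is 16 and the loop above computes
 *
 *   node 0: clamp(DIV_ROUND_UP(16 * 12, 16), 8, 16) = 12
 *   node 1: clamp(DIV_ROUND_UP(16 *  4, 16), 8, 16) = 8
 *
 * The per-node maxes may sum to more than max_active; the clamp only keeps
 * each node within [min_active, max_active], and the NUMA_NO_NODE slot is
 * pinned to min_active.
 */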
+
+/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
@@ -1558,35 +1629,98 @@ static bool pwq_activate_work(struct pool_workqueue *pwq,
return true;
}
+static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
+{
+ int max = READ_ONCE(nna->max);
+
+ while (true) {
+ int old, tmp;
+
+ old = atomic_read(&nna->nr);
+ if (old >= max)
+ return false;
+ tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
+ if (tmp == old)
+ return true;
+ }
+}
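/*
 * Editor's sketch, not part of the patch: tryinc_node_nr_active() above is
 * a bounded-increment CAS loop. The same pattern in standalone C11 atomics
 * (tryinc_below_max is an illustrative name, not a kernel symbol):
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool tryinc_below_max(atomic_int *nr, int max)
{
	int old = atomic_load_explicit(nr, memory_order_relaxed);

	while (old < max) {
		/* on failure, 'old' is refreshed with the current value and we retry */
		if (atomic_compare_exchange_weak_explicit(nr, &old, old + 1,
							  memory_order_relaxed,
							  memory_order_relaxed))
			return true;
	}
	return false;
}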
+
/**
* pwq_tryinc_nr_active - Try to increment nr_active for a pwq
* @pwq: pool_workqueue of interest
+ * @fill: max_active may have increased, try to increase concurrency level
*
* Try to increment nr_active for @pwq. Returns %true if an nr_active count is
* successfully obtained. %false otherwise.
*/
-static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
+static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
{
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
- bool obtained;
+ bool obtained = false;
lockdep_assert_held(&pool->lock);
- obtained = pwq->nr_active < READ_ONCE(wq->max_active);
+ if (!nna) {
+ /* per-cpu workqueue, pwq->nr_active is sufficient */
+ obtained = pwq->nr_active < READ_ONCE(wq->max_active);
+ goto out;
+ }
+
+ /*
+ * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
+ * already waiting on $nna, pwq_dec_nr_active() will maintain the
+ * concurrency level. Don't jump the line.
+ *
+ * We need to ignore the pending test after max_active has increased as
+ * pwq_dec_nr_active() can only maintain the concurrency level but not
+ * increase it. This is indicated by @fill.
+ */
+ if (!list_empty(&pwq->pending_node) && likely(!fill))
+ goto out;
+
+ obtained = tryinc_node_nr_active(nna);
+ if (obtained)
+ goto out;
+
+ /*
+ * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
+ * and try again. The smp_mb() is paired with the implied memory barrier
+ * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
+ * we see the decremented $nna->nr or they see non-empty
+ * $nna->pending_pwqs.
+ */
+ raw_spin_lock(&nna->lock);
+
+ if (list_empty(&pwq->pending_node))
+ list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
+ else if (likely(!fill))
+ goto out_unlock;
+
+ smp_mb();
+
+ obtained = tryinc_node_nr_active(nna);
- if (obtained) {
+ /*
+ * If @fill, @pwq might have already been pending. Being spuriously
+ * pending in cold paths doesn't affect anything. Let's leave it be.
+ */
+ if (obtained && likely(!fill))
+ list_del_init(&pwq->pending_node);
+
+out_unlock:
+ raw_spin_unlock(&nna->lock);
+out:
+ if (obtained)
pwq->nr_active++;
- if (nna)
- atomic_inc(&nna->nr);
- }
return obtained;
}
/**
* pwq_activate_first_inactive - Activate the first inactive work item on a pwq
* @pwq: pool_workqueue of interest
+ * @fill: max_active may have increased, try to increase concurrency level
*
* Activate the first inactive work item of @pwq if available and allowed by
* max_active limit.
@@ -1594,13 +1728,13 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
* Returns %true if an inactive work item has been activated. %false if no
* inactive work item is found or max_active limit is reached.
*/
-static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
+static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
{
struct work_struct *work =
list_first_entry_or_null(&pwq->inactive_works,
struct work_struct, entry);
- if (work && pwq_tryinc_nr_active(pwq)) {
+ if (work && pwq_tryinc_nr_active(pwq, fill)) {
__pwq_activate_work(pwq, work);
return true;
} else {
@@ -1609,10 +1743,92 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
}
/**
+ * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
+ * @nna: wq_node_nr_active to activate a pending pwq for
+ * @caller_pool: worker_pool the caller is locking
+ *
+ * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
+ * @caller_pool may be unlocked and relocked to lock other worker_pools.
+ */
+static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
+ struct worker_pool *caller_pool)
+{
+ struct worker_pool *locked_pool = caller_pool;
+ struct pool_workqueue *pwq;
+ struct work_struct *work;
+
+ lockdep_assert_held(&caller_pool->lock);
+
+ raw_spin_lock(&nna->lock);
+retry:
+ pwq = list_first_entry_or_null(&nna->pending_pwqs,
+ struct pool_workqueue, pending_node);
+ if (!pwq)
+ goto out_unlock;
+
+ /*
+ * If @pwq is for a different pool than @locked_pool, we need to lock
+ * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
+ * / lock dance. For that, we also need to release @nna->lock as it's
+ * nested inside pool locks.
+ */
+ if (pwq->pool != locked_pool) {
+ raw_spin_unlock(&locked_pool->lock);
+ locked_pool = pwq->pool;
+ if (!raw_spin_trylock(&locked_pool->lock)) {
+ raw_spin_unlock(&nna->lock);
+ raw_spin_lock(&locked_pool->lock);
+ raw_spin_lock(&nna->lock);
+ goto retry;
+ }
+ }
+
+ /*
+ * $pwq may not have any inactive work items due to e.g. cancellations.
+ * Drop it from pending_pwqs and see if there's another one.
+ */
+ work = list_first_entry_or_null(&pwq->inactive_works,
+ struct work_struct, entry);
+ if (!work) {
+ list_del_init(&pwq->pending_node);
+ goto retry;
+ }
+
+ /*
+ * Acquire an nr_active count and activate the inactive work item. If
+ * $pwq still has inactive work items, rotate it to the end of the
+ * pending_pwqs so that we round-robin through them. This means that
+ * inactive work items are not activated in queueing order which is fine
+ * given that there has never been any ordering across different pwqs.
+ */
+ if (likely(tryinc_node_nr_active(nna))) {
+ pwq->nr_active++;
+ __pwq_activate_work(pwq, work);
+
+ if (list_empty(&pwq->inactive_works))
+ list_del_init(&pwq->pending_node);
+ else
+ list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
+
+ /* if activating a foreign pool, make sure it's running */
+ if (pwq->pool != caller_pool)
+ kick_pool(pwq->pool);
+ }
+
+out_unlock:
+ raw_spin_unlock(&nna->lock);
+ if (locked_pool != caller_pool) {
+ raw_spin_unlock(&locked_pool->lock);
+ raw_spin_lock(&caller_pool->lock);
+ }
+}
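/*
 * Editor's illustration, not part of the patch: if pending_pwqs holds
 * [A, B] and A still has inactive work items, one call activates a single
 * work item from A and, because A's inactive_works is not yet empty,
 * rotates A to the tail, leaving [B, A]. The next freed count services B,
 * so waiting pwqs share released nr_active counts round-robin rather than
 * letting one pwq drain completely first.
 */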
+
+/**
* pwq_dec_nr_active - Retire an active count
* @pwq: pool_workqueue of interest
*
* Decrement @pwq's nr_active and try to activate the first inactive work item.
+ * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
*/
static void pwq_dec_nr_active(struct pool_workqueue *pwq)
{
@@ -1632,12 +1848,29 @@ static void pwq_dec_nr_active(struct pool_workqueue *pwq)
* inactive work item on @pwq itself.
*/
if (!nna) {
- pwq_activate_first_inactive(pwq);
+ pwq_activate_first_inactive(pwq, false);
return;
}
- atomic_dec(&nna->nr);
- pwq_activate_first_inactive(pwq);
+ /*
+ * If @pwq is for an unbound workqueue, it's more complicated because
+ * multiple pwqs and pools may be sharing the nr_active count. When a
+ * pwq needs to wait for an nr_active count, it puts itself on
+ * $nna->pending_pwqs. The following atomic_dec_return()'s implied
+ * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
+ * guarantee that either we see non-empty pending_pwqs or they see
+ * decremented $nna->nr.
+ *
+ * $nna->max may change as CPUs come online/offline and @pwq->wq's
+ * max_active gets updated. However, it is guaranteed to be equal to or
+ * larger than @pwq->wq->min_active which is above zero unless freezing.
+ * This maintains the forward progress guarantee.
+ */
+ if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
+ return;
+
+ if (!list_empty(&nna->pending_pwqs))
+ node_activate_pending_pwq(nna, pool);
}
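/*
 * Editor's sketch, not part of the patch: the smp_mb() in
 * pwq_tryinc_nr_active() and the full barrier implied by the
 * atomic_dec_return() above form the usual store->load pairing:
 *
 *   waiter (tryinc path)                finisher (dec path)
 *   store: add pwq to pending_pwqs      store: decrement nna->nr
 *   smp_mb()                            full barrier (atomic_dec_return)
 *   load:  re-read nna->nr              load:  check pending_pwqs
 *
 * Both loads cannot miss both stores, so either the waiter grabs the freed
 * count itself or the finisher sees the pending pwq and activates it; a
 * freed count is never lost while a pwq sits on pending_pwqs.
 */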
/**
@@ -1965,7 +2198,7 @@ retry:
* @work must also queue behind existing inactive work items to maintain
* ordering when max_active changes. See wq_adjust_max_active().
*/
- if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq)) {
+ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
if (list_empty(&pool->worklist))
pool->watchdog_ts = jiffies;
@@ -3200,7 +3433,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
barr->task = current;
- /* The barrier work item does not participate in pwq->nr_active. */
+ /* The barrier work item does not participate in nr_active. */
work_flags |= WORK_STRUCT_INACTIVE;
/*
@@ -4116,6 +4349,8 @@ static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
static void init_node_nr_active(struct wq_node_nr_active *nna)
{
atomic_set(&nna->nr, 0);
+ raw_spin_lock_init(&nna->lock);
+ INIT_LIST_HEAD(&nna->pending_pwqs);
}
/*
@@ -4355,6 +4590,15 @@ static void pwq_release_workfn(struct kthread_work *work)
mutex_unlock(&wq_pool_mutex);
}
+ if (!list_empty(&pwq->pending_node)) {
+ struct wq_node_nr_active *nna =
+ wq_node_nr_active(pwq->wq, pwq->pool->node);
+
+ raw_spin_lock_irq(&nna->lock);
+ list_del_init(&pwq->pending_node);
+ raw_spin_unlock_irq(&nna->lock);
+ }
+
call_rcu(&pwq->rcu, rcu_free_pwq);
/*
@@ -4380,6 +4624,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
pwq->flush_color = -1;
pwq->refcnt = 1;
INIT_LIST_HEAD(&pwq->inactive_works);
+ INIT_LIST_HEAD(&pwq->pending_node);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
@@ -4587,6 +4832,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
ctx->pwq_tbl[cpu]);
ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);
+ /* update node_nr_active->max */
+ wq_update_node_max_active(ctx->wq, -1);
+
mutex_unlock(&ctx->wq->mutex);
}
@@ -4850,24 +5098,35 @@ static int init_rescuer(struct workqueue_struct *wq)
static void wq_adjust_max_active(struct workqueue_struct *wq)
{
bool activated;
+ int new_max, new_min;
lockdep_assert_held(&wq->mutex);
if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
- WRITE_ONCE(wq->max_active, 0);
- return;
+ new_max = 0;
+ new_min = 0;
+ } else {
+ new_max = wq->saved_max_active;
+ new_min = wq->saved_min_active;
}
- if (wq->max_active == wq->saved_max_active)
+ if (wq->max_active == new_max && wq->min_active == new_min)
return;
/*
- * Update @wq->max_active and then kick inactive work items if more
+ * Update @wq->max/min_active and then kick inactive work items if more
* active work items are allowed. This doesn't break work item ordering
* because new work items are always queued behind existing inactive
* work items if there are any.
*/
- WRITE_ONCE(wq->max_active, wq->saved_max_active);
+ WRITE_ONCE(wq->max_active, new_max);
+ WRITE_ONCE(wq->min_active, new_min);
+
+ if (wq->flags & WQ_UNBOUND)
+ wq_update_node_max_active(wq, -1);
+
+ if (new_max == 0)
+ return;
/*
* Round-robin through pwq's activating the first inactive work item
@@ -4882,7 +5141,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq)
/* can be called during early boot w/ irq disabled */
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq_activate_first_inactive(pwq)) {
+ if (pwq_activate_first_inactive(pwq, true)) {
activated = true;
kick_pool(pwq->pool);
}
@@ -4944,7 +5203,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
/* init wq */
wq->flags = flags;
wq->max_active = max_active;
- wq->saved_max_active = max_active;
+ wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
+ wq->saved_max_active = wq->max_active;
+ wq->saved_min_active = wq->min_active;
mutex_init(&wq->mutex);
atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->pwqs);
@@ -5110,7 +5371,8 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
* @wq: target workqueue
* @max_active: new max_active value.
*
- * Set max_active of @wq to @max_active.
+ * Set max_active of @wq to @max_active. See the alloc_workqueue() function
+ * comment.
*
* CONTEXT:
* Don't call from IRQ context.
@@ -5127,6 +5389,9 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
+ if (wq->flags & WQ_UNBOUND)
+ wq->saved_min_active = min(wq->saved_min_active, max_active);
+
wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
@@ -5808,6 +6073,10 @@ int workqueue_online_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, true);
+
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, -1);
+ mutex_unlock(&wq->mutex);
}
}
@@ -5836,6 +6105,10 @@ int workqueue_offline_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, false);
+
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, cpu);
+ mutex_unlock(&wq->mutex);
}
}
mutex_unlock(&wq_pool_mutex);
@@ -7127,8 +7400,12 @@ void __init workqueue_init_topology(void)
* combinations to apply per-pod sharing.
*/
list_for_each_entry(wq, &workqueues, list) {
- for_each_online_cpu(cpu) {
+ for_each_online_cpu(cpu)
wq_update_pod(wq, cpu, cpu, true);
+ if (wq->flags & WQ_UNBOUND) {
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, -1);
+ mutex_unlock(&wq->mutex);
}
}