-rw-r--r-- | drivers/infiniband/hw/hfi1/affinity.c | 414
-rw-r--r-- | drivers/infiniband/hw/hfi1/affinity.h | 10
-rw-r--r-- | drivers/infiniband/hw/hfi1/chip.c | 5
-rw-r--r-- | drivers/infiniband/hw/hfi1/hfi.h | 3
-rw-r--r-- | drivers/infiniband/hw/hfi1/init.c | 15
-rw-r--r-- | drivers/infiniband/hw/hfi1/trace.c | 3
-rw-r--r-- | drivers/infiniband/hw/hfi1/trace_dbg.h | 3
-rw-r--r-- | drivers/infiniband/hw/hfi1/verbs.c | 7
-rw-r--r-- | drivers/infiniband/hw/qib/qib_verbs.c | 6
-rw-r--r-- | drivers/infiniband/sw/rdmavt/cq.c | 81
-rw-r--r-- | drivers/infiniband/sw/rdmavt/cq.h | 6
-rw-r--r-- | drivers/infiniband/sw/rdmavt/trace_cq.h | 35
-rw-r--r-- | drivers/infiniband/sw/rdmavt/vt.c | 35
-rw-r--r-- | include/rdma/rdma_vt.h | 7
-rw-r--r-- | include/rdma/rdmavt_cq.h | 5
15 files changed, 534 insertions, 101 deletions
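
Before the hunks themselves, a condensed sketch of the completion path this series ends up with on the rdmavt side: a shared high-priority workqueue replaces the per-device kthread worker, each CQ records the CPU its completion vector maps to (resolved through the new comp_vect_cpu_lookup() driver hook), and completion work is queued on that CPU. This is a hedged illustration, not the literal patch; the sketch_* names and the trimmed CQ structure are stand-ins, and the real changes are in the cq.c and vt.c hunks below.

```c
/*
 * Hedged sketch of the post-patch completion path (not the literal diff).
 * The CQ structure is reduced to the two fields this flow touches.
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

struct sketch_cq {
	int comp_vector_cpu;		/* CPU chosen for this CQ's comp_vector */
	struct work_struct comptask;	/* completion handler work item */
};

static struct workqueue_struct *comp_vector_wq;

/* Module init: one shared workqueue instead of a kthread worker per device. */
static int sketch_cq_wq_init(void)
{
	comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE,
					 0, "rdmavt_cq");
	return comp_vector_wq ? 0 : -ENOMEM;
}

static void sketch_send_complete(struct work_struct *work)
{
	struct sketch_cq *cq = container_of(work, struct sketch_cq, comptask);

	/* ... invoke the consumer's completion handler for @cq ... */
	(void)cq;
}

/* CQ creation: remember the CPU the completion vector resolved to. */
static void sketch_cq_create(struct sketch_cq *cq, int resolved_cpu)
{
	cq->comp_vector_cpu = resolved_cpu;	/* from comp_vect_cpu_lookup() */
	INIT_WORK(&cq->comptask, sketch_send_complete);
}

/* Completion arrival: run the handler on the CQ's pinned CPU. */
static void sketch_cq_kick(struct sketch_cq *cq)
{
	queue_work_on(cq->comp_vector_cpu, comp_vector_wq, &cq->comptask);
}
```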
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index eca9e6354017..fbe7198a715a 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -208,7 +208,13 @@ int node_affinity_init(void) return 0; } -void node_affinity_destroy(void) +static void node_affinity_destroy(struct hfi1_affinity_node *entry) +{ + free_percpu(entry->comp_vect_affinity); + kfree(entry); +} + +void node_affinity_destroy_all(void) { struct list_head *pos, *q; struct hfi1_affinity_node *entry; @@ -218,7 +224,7 @@ void node_affinity_destroy(void) entry = list_entry(pos, struct hfi1_affinity_node, list); list_del(pos); - kfree(entry); + node_affinity_destroy(entry); } mutex_unlock(&node_affinity.lock); kfree(hfi1_per_node_cntr); @@ -232,6 +238,7 @@ static struct hfi1_affinity_node *node_affinity_allocate(int node) if (!entry) return NULL; entry->node = node; + entry->comp_vect_affinity = alloc_percpu(u16); INIT_LIST_HEAD(&entry->list); return entry; @@ -261,6 +268,341 @@ static struct hfi1_affinity_node *node_affinity_lookup(int node) return NULL; } +static int per_cpu_affinity_get(cpumask_var_t possible_cpumask, + u16 __percpu *comp_vect_affinity) +{ + int curr_cpu; + u16 cntr; + u16 prev_cntr; + int ret_cpu; + + if (!possible_cpumask) { + ret_cpu = -EINVAL; + goto fail; + } + + if (!comp_vect_affinity) { + ret_cpu = -EINVAL; + goto fail; + } + + ret_cpu = cpumask_first(possible_cpumask); + if (ret_cpu >= nr_cpu_ids) { + ret_cpu = -EINVAL; + goto fail; + } + + prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu); + for_each_cpu(curr_cpu, possible_cpumask) { + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); + + if (cntr < prev_cntr) { + ret_cpu = curr_cpu; + prev_cntr = cntr; + } + } + + *per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1; + +fail: + return ret_cpu; +} + +static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask, + u16 __percpu *comp_vect_affinity) +{ + int curr_cpu; + int max_cpu; + u16 cntr; + u16 prev_cntr; + + if (!possible_cpumask) + return -EINVAL; + + if (!comp_vect_affinity) + return -EINVAL; + + max_cpu = cpumask_first(possible_cpumask); + if (max_cpu >= nr_cpu_ids) + return -EINVAL; + + prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu); + for_each_cpu(curr_cpu, possible_cpumask) { + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); + + if (cntr > prev_cntr) { + max_cpu = curr_cpu; + prev_cntr = cntr; + } + } + + *per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1; + + return max_cpu; +} + +/* + * Non-interrupt CPUs are used first, then interrupt CPUs. + * Two already allocated cpu masks must be passed. 
+ */ +static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry, + cpumask_var_t non_intr_cpus, + cpumask_var_t available_cpus) + __must_hold(&node_affinity.lock) +{ + int cpu; + struct cpu_mask_set *set = dd->comp_vect; + + lockdep_assert_held(&node_affinity.lock); + if (!non_intr_cpus) { + cpu = -1; + goto fail; + } + + if (!available_cpus) { + cpu = -1; + goto fail; + } + + /* Available CPUs for pinning completion vectors */ + _cpu_mask_set_gen_inc(set); + cpumask_andnot(available_cpus, &set->mask, &set->used); + + /* Available CPUs without SDMA engine interrupts */ + cpumask_andnot(non_intr_cpus, available_cpus, + &entry->def_intr.used); + + /* If there are non-interrupt CPUs available, use them first */ + if (!cpumask_empty(non_intr_cpus)) + cpu = cpumask_first(non_intr_cpus); + else /* Otherwise, use interrupt CPUs */ + cpu = cpumask_first(available_cpus); + + if (cpu >= nr_cpu_ids) { /* empty */ + cpu = -1; + goto fail; + } + cpumask_set_cpu(cpu, &set->used); + +fail: + return cpu; +} + +static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu) +{ + struct cpu_mask_set *set = dd->comp_vect; + + if (cpu < 0) + return; + + cpu_mask_set_put(set, cpu); +} + +/* _dev_comp_vect_mappings_destroy() is reentrant */ +static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd) +{ + int i, cpu; + + if (!dd->comp_vect_mappings) + return; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = dd->comp_vect_mappings[i]; + _dev_comp_vect_cpu_put(dd, cpu); + dd->comp_vect_mappings[i] = -1; + hfi1_cdbg(AFFINITY, + "[%s] Release CPU %d from completion vector %d", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i); + } + + kfree(dd->comp_vect_mappings); + dd->comp_vect_mappings = NULL; +} + +/* + * This function creates the table for looking up CPUs for completion vectors. + * num_comp_vectors needs to have been initilized before calling this function. 
+ */ +static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry) + __must_hold(&node_affinity.lock) +{ + int i, cpu, ret; + cpumask_var_t non_intr_cpus; + cpumask_var_t available_cpus; + + lockdep_assert_held(&node_affinity.lock); + + if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) { + free_cpumask_var(non_intr_cpus); + return -ENOMEM; + } + + dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus, + sizeof(*dd->comp_vect_mappings), + GFP_KERNEL); + if (!dd->comp_vect_mappings) { + ret = -ENOMEM; + goto fail; + } + for (i = 0; i < dd->comp_vect_possible_cpus; i++) + dd->comp_vect_mappings[i] = -1; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus, + available_cpus); + if (cpu < 0) { + ret = -EINVAL; + goto fail; + } + + dd->comp_vect_mappings[i] = cpu; + hfi1_cdbg(AFFINITY, + "[%s] Completion Vector %d -> CPU %d", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu); + } + + return 0; + +fail: + free_cpumask_var(available_cpus); + free_cpumask_var(non_intr_cpus); + _dev_comp_vect_mappings_destroy(dd); + + return ret; +} + +int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd) +{ + int ret; + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) { + ret = -EINVAL; + goto unlock; + } + ret = _dev_comp_vect_mappings_create(dd, entry); +unlock: + mutex_unlock(&node_affinity.lock); + + return ret; +} + +void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd) +{ + _dev_comp_vect_mappings_destroy(dd); +} + +int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + + if (!dd->comp_vect_mappings) + return -EINVAL; + if (comp_vect >= dd->comp_vect_possible_cpus) + return -EINVAL; + + return dd->comp_vect_mappings[comp_vect]; +} + +/* + * It assumes dd->comp_vect_possible_cpus is available. + */ +static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry, + bool first_dev_init) + __must_hold(&node_affinity.lock) +{ + int i, j, curr_cpu; + int possible_cpus_comp_vect = 0; + struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask; + + lockdep_assert_held(&node_affinity.lock); + /* + * If there's only one CPU available for completion vectors, then + * there will only be one completion vector available. Othewise, + * the number of completion vector available will be the number of + * available CPUs divide it by the number of devices in the + * local NUMA node. + */ + if (cpumask_weight(&entry->comp_vect_mask) == 1) { + possible_cpus_comp_vect = 1; + dd_dev_warn(dd, + "Number of kernel receive queues is too large for completion vector affinity to be effective\n"); + } else { + possible_cpus_comp_vect += + cpumask_weight(&entry->comp_vect_mask) / + hfi1_per_node_cntr[dd->node]; + + /* + * If the completion vector CPUs available doesn't divide + * evenly among devices, then the first device device to be + * initialized gets an extra CPU. 
+ */ + if (first_dev_init && + cpumask_weight(&entry->comp_vect_mask) % + hfi1_per_node_cntr[dd->node] != 0) + possible_cpus_comp_vect++; + } + + dd->comp_vect_possible_cpus = possible_cpus_comp_vect; + + /* Reserving CPUs for device completion vector */ + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask, + entry->comp_vect_affinity); + if (curr_cpu < 0) + goto fail; + + cpumask_set_cpu(curr_cpu, dev_comp_vect_mask); + } + + hfi1_cdbg(AFFINITY, + "[%s] Completion vector affinity CPU set(s) %*pbl", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), + cpumask_pr_args(dev_comp_vect_mask)); + + return 0; + +fail: + for (j = 0; j < i; j++) + per_cpu_affinity_put_max(&entry->comp_vect_mask, + entry->comp_vect_affinity); + + return curr_cpu; +} + +/* + * It assumes dd->comp_vect_possible_cpus is available. + */ +static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry) + __must_hold(&node_affinity.lock) +{ + int i, cpu; + + lockdep_assert_held(&node_affinity.lock); + if (!dd->comp_vect_possible_cpus) + return; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask, + entry->comp_vect_affinity); + /* Clearing CPU in device completion vector cpu mask */ + if (cpu >= 0) + cpumask_clear_cpu(cpu, &dd->comp_vect->mask); + } + + dd->comp_vect_possible_cpus = 0; +} + /* * Interrupt affinity. * @@ -277,7 +619,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) int node = pcibus_to_node(dd->pcidev->bus); struct hfi1_affinity_node *entry; const struct cpumask *local_mask; - int curr_cpu, possible, i; + int curr_cpu, possible, i, ret; + bool new_entry = false; if (node < 0) node = numa_node_id(); @@ -299,11 +642,14 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) if (!entry) { dd_dev_err(dd, "Unable to allocate global affinity node\n"); - mutex_unlock(&node_affinity.lock); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } + new_entry = true; + init_cpu_mask_set(&entry->def_intr); init_cpu_mask_set(&entry->rcv_intr); + cpumask_clear(&entry->comp_vect_mask); cpumask_clear(&entry->general_intr_mask); /* Use the "real" cpu mask of this node as the default */ cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, @@ -356,10 +702,64 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) &entry->general_intr_mask); } - node_affinity_add_tail(entry); + /* Determine completion vector CPUs for the entire node */ + cpumask_and(&entry->comp_vect_mask, + &node_affinity.real_cpu_mask, local_mask); + cpumask_andnot(&entry->comp_vect_mask, + &entry->comp_vect_mask, + &entry->rcv_intr.mask); + cpumask_andnot(&entry->comp_vect_mask, + &entry->comp_vect_mask, + &entry->general_intr_mask); + + /* + * If there ends up being 0 CPU cores leftover for completion + * vectors, use the same CPU core as the general/control + * context. 
+ */ + if (cpumask_weight(&entry->comp_vect_mask) == 0) + cpumask_copy(&entry->comp_vect_mask, + &entry->general_intr_mask); } + + ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry); + if (ret < 0) + goto fail; + + if (new_entry) + node_affinity_add_tail(entry); + mutex_unlock(&node_affinity.lock); + return 0; + +fail: + if (new_entry) + node_affinity_destroy(entry); + mutex_unlock(&node_affinity.lock); + return ret; +} + +void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) +{ + struct hfi1_affinity_node *entry; + + if (dd->node < 0) + return; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) + goto unlock; + + /* + * Free device completion vector CPUs to be used by future + * completion vectors + */ + _dev_comp_vect_cpu_mask_clean_up(dd, entry); +unlock: + mutex_unlock(&node_affinity.lock); + dd->node = -1; } /* diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index 2a1e374169c0..6a7e6ea4e426 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -98,9 +98,11 @@ void hfi1_put_proc_affinity(int cpu); struct hfi1_affinity_node { int node; + u16 __percpu *comp_vect_affinity; struct cpu_mask_set def_intr; struct cpu_mask_set rcv_intr; struct cpumask general_intr_mask; + struct cpumask comp_vect_mask; struct list_head list; }; @@ -116,7 +118,11 @@ struct hfi1_affinity_node_list { }; int node_affinity_init(void); -void node_affinity_destroy(void); +void node_affinity_destroy_all(void); extern struct hfi1_affinity_node_list node_affinity; +void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd); +int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect); +int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd); +void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd); #endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 0fab6df0a345..46e9e4ffcba4 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -15233,6 +15233,10 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_cleanup; + ret = hfi1_comp_vectors_set_up(dd); + if (ret) + goto bail_clear_intr; + /* set up LCB access - must be after set_up_interrupts() */ init_lcb_access(dd); @@ -15275,6 +15279,7 @@ bail_free_rcverr: bail_free_cntrs: free_cntrs(dd); bail_clear_intr: + hfi1_comp_vectors_clean_up(dd); hfi1_clean_up_interrupts(dd); bail_cleanup: hfi1_pcie_ddcleanup(dd); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 9cd758ce7764..dd84238c1aac 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1263,6 +1263,9 @@ struct hfi1_devdata { /* Save the enabled LCB error bits */ u64 lcb_err_en; + struct cpu_mask_set *comp_vect; + int *comp_vect_mappings; + u32 comp_vect_possible_cpus; /* * Capability to have different send engines simply by changing a diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 790542ce89a5..5d1adfc450d3 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. 
+ * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -1244,6 +1244,8 @@ static void hfi1_clean_devdata(struct hfi1_devdata *dd) dd->rcv_limit = NULL; dd->send_schedule = NULL; dd->tx_opstats = NULL; + kfree(dd->comp_vect); + dd->comp_vect = NULL; sdma_clean(dd, dd->num_sdma); rvt_dealloc_device(&dd->verbs_dev.rdi); } @@ -1300,6 +1302,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) dd->unit = ret; list_add(&dd->list, &hfi1_dev_list); } + dd->node = -1; spin_unlock_irqrestore(&hfi1_devs_lock, flags); idr_preload_end(); @@ -1352,6 +1355,12 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) goto bail; } + dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); + if (!dd->comp_vect) { + ret = -ENOMEM; + goto bail; + } + kobject_init(&dd->kobj, &hfi1_devdata_type); return dd; @@ -1521,7 +1530,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); - node_affinity_destroy(); + node_affinity_destroy_all(); hfi1_wss_exit(); hfi1_dbg_exit(); @@ -1605,6 +1614,8 @@ static void cleanup_device_data(struct hfi1_devdata *dd) static void postinit_cleanup(struct hfi1_devdata *dd) { hfi1_start_cleanup(dd); + hfi1_comp_vectors_clean_up(dd); + hfi1_dev_affinity_clean_up(dd); hfi1_pcie_ddcleanup(dd); hfi1_pcie_cleanup(dd->pcidev); diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 89bd9851065b..332b9b7c554a 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -374,6 +374,7 @@ const char *print_u32_array( return ret; } +__hfi1_trace_fn(AFFINITY); __hfi1_trace_fn(PKT); __hfi1_trace_fn(PROC); __hfi1_trace_fn(SDMA); diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h index 0e7d929530c5..e62171fb7379 100644 --- a/drivers/infiniband/hw/hfi1/trace_dbg.h +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -1,5 +1,5 @@ /* -* Copyright(c) 2015, 2016 Intel Corporation. +* Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -113,6 +113,7 @@ void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ * hfi1_cdbg(LVL, fmt, ...); as well as take care of all * the debugfs stuff. 
*/ +__hfi1_trace_def(AFFINITY); __hfi1_trace_def(PKT); __hfi1_trace_def(PROC); __hfi1_trace_def(SDMA); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 9554e912af98..fc2e44cde161 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -64,6 +64,7 @@ #include "debugfs.h" #include "vnic.h" #include "fault.h" +#include "affinity.h" static unsigned int hfi1_lkey_table_size = 16; module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, @@ -1934,11 +1935,11 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = + hfi1_comp_vect_mappings_lookup; /* completeion queue */ - snprintf(dd->verbs_dev.rdi.dparms.cq_name, - sizeof(dd->verbs_dev.rdi.dparms.cq_name), - "hfi1_cq%d", dd->unit); + dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus; dd->verbs_dev.rdi.dparms.node = dd->node; /* misc settings */ diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 3977abbc83ad..14b4057a2b8f 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2018 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -1631,10 +1631,6 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; - snprintf(dd->verbs_dev.rdi.dparms.cq_name, - sizeof(dd->verbs_dev.rdi.dparms.cq_name), - "qib_cq%d", dd->unit); - qib_fill_device_attr(dd); ppd = dd->pport; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 340c17aba3b0..4f1544ad4aff 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -47,11 +47,12 @@ #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/kthread.h> #include "cq.h" #include "vt.h" #include "trace.h" +static struct workqueue_struct *comp_vector_wq; + /** * rvt_cq_enter - add a new entry to the completion queue * @cq: completion queue @@ -120,27 +121,21 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && (solicited || entry->status != IB_WC_SUCCESS))) { - struct kthread_worker *worker; - /* * This will cause send_complete() to be called in * another thread. 
*/ - rcu_read_lock(); - worker = rcu_dereference(cq->rdi->worker); - if (likely(worker)) { - cq->notify = RVT_CQ_NONE; - cq->triggered++; - kthread_queue_work(worker, &cq->comptask); - } - rcu_read_unlock(); + cq->notify = RVT_CQ_NONE; + cq->triggered++; + queue_work_on(cq->comp_vector_cpu, comp_vector_wq, + &cq->comptask); } spin_unlock_irqrestore(&cq->lock, flags); } EXPORT_SYMBOL(rvt_cq_enter); -static void send_complete(struct kthread_work *work) +static void send_complete(struct work_struct *work) { struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask); @@ -192,6 +187,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, struct ib_cq *ret; u32 sz; unsigned int entries = attr->cqe; + int comp_vector = attr->comp_vector; if (attr->flags) return ERR_PTR(-EINVAL); @@ -199,6 +195,11 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, if (entries < 1 || entries > rdi->dparms.props.max_cqe) return ERR_PTR(-EINVAL); + if (comp_vector < 0) + comp_vector = 0; + + comp_vector = comp_vector % rdi->ibdev.num_comp_vectors; + /* Allocate the completion queue structure. */ cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); if (!cq) @@ -267,14 +268,22 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, * an error. */ cq->rdi = rdi; + if (rdi->driver_f.comp_vect_cpu_lookup) + cq->comp_vector_cpu = + rdi->driver_f.comp_vect_cpu_lookup(rdi, comp_vector); + else + cq->comp_vector_cpu = + cpumask_first(cpumask_of_node(rdi->dparms.node)); + cq->ibcq.cqe = entries; cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); - kthread_init_work(&cq->comptask, send_complete); + INIT_WORK(&cq->comptask, send_complete); cq->queue = wc; ret = &cq->ibcq; + trace_rvt_create_cq(cq, attr); goto done; bail_ip: @@ -300,7 +309,7 @@ int rvt_destroy_cq(struct ib_cq *ibcq) struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); struct rvt_dev_info *rdi = cq->rdi; - kthread_flush_work(&cq->comptask); + flush_work(&cq->comptask); spin_lock_irq(&rdi->n_cqs_lock); rdi->n_cqs_allocated--; spin_unlock_irq(&rdi->n_cqs_lock); @@ -510,24 +519,13 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) * * Return: 0 on success */ -int rvt_driver_cq_init(struct rvt_dev_info *rdi) +int rvt_driver_cq_init(void) { - int cpu; - struct kthread_worker *worker; - - if (rcu_access_pointer(rdi->worker)) - return 0; - - spin_lock_init(&rdi->n_cqs_lock); - - cpu = cpumask_first(cpumask_of_node(rdi->dparms.node)); - worker = kthread_create_worker_on_cpu(cpu, 0, - "%s", rdi->dparms.cq_name); - if (IS_ERR(worker)) - return PTR_ERR(worker); + comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE, + 0, "rdmavt_cq"); + if (!comp_vector_wq) + return -ENOMEM; - set_user_nice(worker->task, MIN_NICE); - RCU_INIT_POINTER(rdi->worker, worker); return 0; } @@ -535,23 +533,8 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) * rvt_cq_exit - tear down cq reources * @rdi: rvt dev structure */ -void rvt_cq_exit(struct rvt_dev_info *rdi) +void rvt_cq_exit(void) { - struct kthread_worker *worker; - - if (!rcu_access_pointer(rdi->worker)) - return; - - spin_lock(&rdi->n_cqs_lock); - worker = rcu_dereference_protected(rdi->worker, - lockdep_is_held(&rdi->n_cqs_lock)); - if (!worker) { - spin_unlock(&rdi->n_cqs_lock); - return; - } - RCU_INIT_POINTER(rdi->worker, NULL); - spin_unlock(&rdi->n_cqs_lock); - synchronize_rcu(); - - kthread_destroy_worker(worker); + destroy_workqueue(comp_vector_wq); + comp_vector_wq = NULL; } diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 
6182c29eff66..72184b1c176b 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -2,7 +2,7 @@ #define DEF_RVTCQ_H /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -59,6 +59,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); -int rvt_driver_cq_init(struct rvt_dev_info *rdi); -void rvt_cq_exit(struct rvt_dev_info *rdi); +int rvt_driver_cq_init(void); +void rvt_cq_exit(void); #endif /* DEF_RVTCQ_H */ diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h index a315850aa9bb..df8e1adbef9d 100644 --- a/drivers/infiniband/sw/rdmavt/trace_cq.h +++ b/drivers/infiniband/sw/rdmavt/trace_cq.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -71,6 +71,39 @@ __print_symbolic(opcode, \ wc_opcode_name(RECV), \ wc_opcode_name(RECV_RDMA_WITH_IMM)) +#define CQ_ATTR_PRINT \ +"[%s] user cq %s cqe %u comp_vector %d comp_vector_cpu %d flags %x" + +DECLARE_EVENT_CLASS(rvt_cq_template, + TP_PROTO(struct rvt_cq *cq, + const struct ib_cq_init_attr *attr), + TP_ARGS(cq, attr), + TP_STRUCT__entry(RDI_DEV_ENTRY(cq->rdi) + __field(struct rvt_mmap_info *, ip) + __field(unsigned int, cqe) + __field(int, comp_vector) + __field(int, comp_vector_cpu) + __field(u32, flags) + ), + TP_fast_assign(RDI_DEV_ASSIGN(cq->rdi) + __entry->ip = cq->ip; + __entry->cqe = attr->cqe; + __entry->comp_vector = attr->comp_vector; + __entry->comp_vector_cpu = + cq->comp_vector_cpu; + __entry->flags = attr->flags; + ), + TP_printk(CQ_ATTR_PRINT, __get_str(dev), + __entry->ip ? "true" : "false", __entry->cqe, + __entry->comp_vector, __entry->comp_vector_cpu, + __entry->flags + ) +); + +DEFINE_EVENT(rvt_cq_template, rvt_create_cq, + TP_PROTO(struct rvt_cq *cq, const struct ib_cq_init_attr *attr), + TP_ARGS(cq, attr)); + #define CQ_PRN \ "[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x" diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 434199d0bc96..17e4abc067af 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -49,6 +49,7 @@ #include <linux/kernel.h> #include <linux/dma-mapping.h> #include "vt.h" +#include "cq.h" #include "trace.h" #define RVT_UVERBS_ABI_VERSION 2 @@ -58,21 +59,18 @@ MODULE_DESCRIPTION("RDMA Verbs Transport Library"); static int rvt_init(void) { - /* - * rdmavt does not need to do anything special when it starts up. All it - * needs to do is sit and wait until a driver attempts registration. - */ - return 0; + int ret = rvt_driver_cq_init(); + + if (ret) + pr_err("Error in driver CQ init.\n"); + + return ret; } module_init(rvt_init); static void rvt_cleanup(void) { - /* - * Nothing to do at exit time either. 
The module won't be able to be - * removed until all drivers are gone which means all the dev structs - * are gone so there is really nothing to do. - */ + rvt_cq_exit(); } module_exit(rvt_cleanup); @@ -777,11 +775,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) } /* Completion queues */ - ret = rvt_driver_cq_init(rdi); - if (ret) { - pr_err("Error in driver CQ init.\n"); - goto bail_mr; - } + spin_lock_init(&rdi->n_cqs_lock); /* DMA Operations */ rdi->ibdev.dev.dma_ops = rdi->ibdev.dev.dma_ops ? : &dma_virt_ops; @@ -829,14 +823,15 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); rdi->ibdev.node_type = RDMA_NODE_IB_CA; - rdi->ibdev.num_comp_vectors = 1; + if (!rdi->ibdev.num_comp_vectors) + rdi->ibdev.num_comp_vectors = 1; rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); - goto bail_cq; + goto bail_mr; } rvt_create_mad_agents(rdi); @@ -844,9 +839,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rvt_pr_info(rdi, "Registration with rdmavt done.\n"); return ret; -bail_cq: - rvt_cq_exit(rdi); - bail_mr: rvt_mr_exit(rdi); @@ -870,7 +862,6 @@ void rvt_unregister_device(struct rvt_dev_info *rdi) rvt_free_mad_agents(rdi); ib_unregister_device(&rdi->ibdev); - rvt_cq_exit(rdi); rvt_mr_exit(rdi); rvt_qp_exit(rdi); } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index eec495e68823..e79229a0cf01 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -2,7 +2,7 @@ #define DEF_RDMA_VT_H /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -167,7 +167,6 @@ struct rvt_driver_params { int qpn_res_end; int nports; int npkeys; - char cq_name[RVT_CQN_MAX]; int node; int psn_mask; int psn_shift; @@ -347,6 +346,9 @@ struct rvt_driver_provided { /* Notify driver to restart rc */ void (*notify_restart_rc)(struct rvt_qp *qp, u32 psn, int wait); + + /* Get and return CPU to pin CQ processing thread */ + int (*comp_vect_cpu_lookup)(struct rvt_dev_info *rdi, int comp_vect); }; struct rvt_dev_info { @@ -402,7 +404,6 @@ struct rvt_dev_info { spinlock_t pending_lock; /* protect pending mmap list */ /* CQ */ - struct kthread_worker __rcu *worker; /* per device cq worker */ u32 n_cqs_allocated; /* number of CQs allocated for device */ spinlock_t n_cqs_lock; /* protect count of in use cqs */ diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index 51fd00b243d0..75dc65c0bfb8 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -8,7 +8,7 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -80,10 +80,11 @@ struct rvt_cq_wc { */ struct rvt_cq { struct ib_cq ibcq; - struct kthread_work comptask; + struct work_struct comptask; spinlock_t lock; /* protect changes in this struct */ u8 notify; u8 triggered; + int comp_vector_cpu; struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; struct rvt_mmap_info *ip; |
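
For reference, the CPU-selection scheme the hfi1 hunks above implement with per-CPU counters (per_cpu_affinity_get() picks the least-referenced CPU in a mask and bumps its count; per_cpu_affinity_put_max() drops a reference from the most-referenced one) can be modelled in plain user-space C. A minimal, assumption-laden sketch: an ordinary array stands in for alloc_percpu()/per_cpu_ptr(), a boolean array stands in for the cpumask, and the cpu_affinity_* names are illustrative only.

```c
/*
 * Illustrative user-space model of the per-CPU reference counting used to
 * spread completion vectors: take the least-used allowed CPU on "get",
 * release from the most-used allowed CPU on "put".
 */
#include <stdio.h>

#define NR_CPUS 8

static unsigned short comp_vect_affinity[NR_CPUS];	/* per-CPU use counts */

/* Return the least-used CPU allowed by @mask and take a reference on it. */
static int cpu_affinity_get(const int *mask)
{
	int cpu, best = -1;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!mask[cpu])
			continue;
		if (best < 0 || comp_vect_affinity[cpu] < comp_vect_affinity[best])
			best = cpu;
	}
	if (best >= 0)
		comp_vect_affinity[best]++;
	return best;	/* -1 if the mask was empty */
}

/* Drop a reference from the most-used CPU allowed by @mask. */
static int cpu_affinity_put_max(const int *mask)
{
	int cpu, max = -1;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!mask[cpu])
			continue;
		if (max < 0 || comp_vect_affinity[cpu] > comp_vect_affinity[max])
			max = cpu;
	}
	if (max >= 0)
		comp_vect_affinity[max]--;
	return max;
}

int main(void)
{
	/* Pretend CPUs 1-3 are the node's non-interrupt CPUs. */
	int mask[NR_CPUS] = { 0, 1, 1, 1, 0, 0, 0, 0 };
	int i;

	/* Five completion vectors are spread round-robin over three CPUs. */
	for (i = 0; i < 5; i++)
		printf("comp vector %d -> CPU %d\n", i, cpu_affinity_get(mask));

	/* Tear-down walks the reference counts back down. */
	for (i = 0; i < 5; i++)
		cpu_affinity_put_max(mask);
	return 0;
}
```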