Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig             4
-rw-r--r--  virt/kvm/arm/arch_timer.c    2
-rw-r--r--  virt/kvm/arm/vgic.c          2
-rw-r--r--  virt/kvm/async_pf.c         27
-rw-r--r--  virt/kvm/eventfd.c           8
-rw-r--r--  virt/kvm/ioapic.c          108
-rw-r--r--  virt/kvm/kvm_main.c         12
-rw-r--r--  virt/kvm/vfio.c             27
8 files changed, 130 insertions, 60 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index fbe1a48bd629..13f2d19793e3 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -22,6 +22,10 @@ config KVM_MMIO
config KVM_ASYNC_PF
bool
+# Toggle to switch between direct notification and batch job
+config KVM_ASYNC_PF_SYNC
+ bool
+
config HAVE_KVM_MSI
bool
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 5081e809821f..22fa819a9b6a 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -277,7 +277,7 @@ int kvm_timer_hyp_init(void)
host_vtimer_irq = ppi;
- err = register_cpu_notifier(&kvm_timer_cpu_nb);
+ err = __register_cpu_notifier(&kvm_timer_cpu_nb);
if (err) {
kvm_err("Cannot register timer CPU notifier\n");
goto out_free;
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 8ca405cd7c1a..47b29834a6b6 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1496,7 +1496,7 @@ int kvm_vgic_hyp_init(void)
goto out;
}
- ret = register_cpu_notifier(&vgic_cpu_nb);
+ ret = __register_cpu_notifier(&vgic_cpu_nb);
if (ret) {
kvm_err("Cannot register vgic CPU notifier\n");
goto out_free_irq;
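
[Note: both ARM hunks above make the same change: register_cpu_notifier() becomes __register_cpu_notifier(), the variant that does not take the CPU hotplug notifier lock itself and therefore must be called with that lock already held. A minimal sketch of the calling convention this implies, assuming the callers wrap their init paths in cpu_notifier_register_begin()/cpu_notifier_register_done() — those calls sit outside the hunks shown here, and the names below are hypothetical:

#include <linux/cpu.h>

/* Sketch: the locking protocol __register_cpu_notifier() expects.
 * The begin/done pair excludes CPU hotplug, so per-CPU setup and the
 * notifier registration cannot race with a CPU coming online. */
static struct notifier_block example_cpu_nb;	/* hypothetical */

static int example_hyp_init(void)
{
	int err;

	cpu_notifier_register_begin();

	/* ... per-CPU initialisation that must see a stable CPU set ... */

	err = __register_cpu_notifier(&example_cpu_nb);

	cpu_notifier_register_done();
	return err;
}
]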
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 8631d9c14320..10df100c4514 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -28,6 +28,21 @@
#include "async_pf.h"
#include <trace/events/kvm.h>
+static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+ kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+#ifndef CONFIG_KVM_ASYNC_PF_SYNC
+ kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+
static struct kmem_cache *async_pf_cache;
int kvm_async_pf_init(void)
@@ -69,6 +84,7 @@ static void async_pf_execute(struct work_struct *work)
down_read(&mm->mmap_sem);
get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL);
up_read(&mm->mmap_sem);
+ kvm_async_page_present_sync(vcpu, apf);
unuse_mm(mm);
spin_lock(&vcpu->async_pf.lock);
@@ -97,11 +113,16 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
list_entry(vcpu->async_pf.queue.next,
typeof(*work), queue);
list_del(&work->queue);
+
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+ flush_work(&work->work);
+#else
if (cancel_work_sync(&work->work)) {
mmdrop(work->mm);
kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
kmem_cache_free(async_pf_cache, work);
}
+#endif
}
spin_lock(&vcpu->async_pf.lock);
@@ -130,7 +151,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->async_pf.lock);
kvm_arch_async_page_ready(vcpu, work);
- kvm_arch_async_page_present(vcpu, work);
+ kvm_async_page_present_async(vcpu, work);
list_del(&work->queue);
vcpu->async_pf.queued--;
@@ -138,7 +159,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
}
}
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
struct kvm_arch_async_pf *arch)
{
struct kvm_async_pf *work;
@@ -159,7 +180,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
work->wakeup_all = false;
work->vcpu = vcpu;
work->gva = gva;
- work->addr = gfn_to_hva(vcpu->kvm, gfn);
+ work->addr = hva;
work->arch = *arch;
work->mm = current->mm;
atomic_inc(&work->mm->mm_count);
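
[Note: the new KVM_ASYNC_PF_SYNC toggle (see the Kconfig hunk above) selects, at compile time, where the "page present" notification runs: synchronously in the workqueue worker right after get_user_pages(), or asynchronously on the vcpu thread from kvm_check_async_pf_completion(). Teardown changes accordingly: the sync model drains the queue with flush_work() (wait for each worker to finish so its notification is delivered), where the async model keeps cancel_work_sync() and frees work items that never ran. A small userspace model of the compile-time dispatch — names are hypothetical, build with or without -DCONFIG_KVM_ASYNC_PF_SYNC:

#include <stdio.h>

/* Userspace model (simplified; the real code is in the hunks above):
 * exactly one of the two paths fires, chosen at compile time. */
static void arch_page_present(const char *ctx)
{
	printf("page present notified from %s context\n", ctx);
}

static void worker(void)
{
	/* get_user_pages() would run here */
#ifdef CONFIG_KVM_ASYNC_PF_SYNC
	arch_page_present("workqueue");	/* sync model */
#endif
}

static void vcpu_check_completion(void)
{
#ifndef CONFIG_KVM_ASYNC_PF_SYNC
	arch_page_present("vcpu");	/* async model */
#endif
}

int main(void)
{
	worker();
	vcpu_check_completion();
	return 0;
}
]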
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index abe4d6043b36..29c2a04e036e 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -391,19 +391,19 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
lockdep_is_held(&kvm->irqfds.lock));
irqfd_update(kvm, irqfd, irq_rt);
- events = f.file->f_op->poll(f.file, &irqfd->pt);
-
list_add_tail(&irqfd->list, &kvm->irqfds.items);
+ spin_unlock_irq(&kvm->irqfds.lock);
+
/*
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
*/
+ events = f.file->f_op->poll(f.file, &irqfd->pt);
+
if (events & POLLIN)
schedule_work(&irqfd->inject);
- spin_unlock_irq(&kvm->irqfds.lock);
-
/*
* do not drop the file until the irqfd is fully initialized, otherwise
* we might race against the POLLHUP
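
[Note: the irqfd change is an ordering fix: the irqfd is now added to kvm->irqfds.items and the spinlock dropped before the initial f_op->poll() check, instead of polling first under the lock. With the old order, an event signalled after the poll but before the list insertion belonged to nobody — the wakeup path found no irqfd on the list, and the poll had already reported the eventfd empty. Registering first closes that window, and as a side effect the poll call no longer runs under a spinlock held with interrupts off. A deterministic userspace model of the two orderings, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

static bool registered, pending, delivered;

/* The "producer": sets the event and notifies anyone registered. */
static void signal_event(void)
{
	pending = true;
	if (registered) {
		delivered = true;
		pending = false;
	}
}

int main(void)
{
	/* Old order: poll, then register. The event fires in the gap
	 * and is lost -- the producer saw no listener, and the poll
	 * already ran. */
	registered = pending = delivered = false;
	if (pending)
		delivered = true;	/* poll: nothing pending yet */
	signal_event();			/* fires in the gap */
	registered = true;		/* registered too late */
	printf("poll-then-register: delivered=%d\n", delivered);  /* 0 */

	/* New order: register, then poll. The same gap is now safe:
	 * either the producer sees us on the list, or the poll below
	 * observes the pending event. */
	registered = pending = delivered = false;
	registered = true;
	signal_event();			/* producer finds a listener */
	if (pending)
		delivered = true;
	printf("register-then-poll: delivered=%d\n", delivered);  /* 1 */
	return 0;
}
]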
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ce9ed99ad7dc..d4b601547f1f 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -50,7 +50,7 @@
#else
#define ioapic_debug(fmt, arg...)
#endif
-static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq,
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -163,23 +163,67 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
return false;
}
-static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx,
- bool line_status)
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
{
- union kvm_ioapic_redirect_entry *pent;
- int injected = -1;
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
- pent = &ioapic->redirtbl[idx];
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
- if (!pent->fields.mask) {
- injected = ioapic_deliver(ioapic, idx, line_status);
- if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
- pent->fields.remote_irr = 1;
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * to masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
}
- return injected;
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if ((edge && old_irr == ioapic->irr) ||
+ (!edge && entry.fields.remote_irr)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
}
+
static void update_handled_vectors(struct kvm_ioapic *ioapic)
{
DECLARE_BITMAP(handled_vectors, 256);
@@ -282,12 +326,15 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
}
}
-static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
{
union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
struct kvm_lapic_irq irqe;
int ret;
+ if (entry->fields.mask)
+ return -1;
+
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
"vector=%x trig_mode=%x\n",
entry->fields.dest_id, entry->fields.dest_mode,
@@ -302,6 +349,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.level = 1;
irqe.shorthand = 0;
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr &= ~(1 << irq);
+
if (irq == RTC_GSI && line_status) {
BUG_ON(ioapic->rtc_status.pending_eoi != 0);
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
@@ -310,45 +360,24 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
} else
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
return ret;
}
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status)
{
- u32 old_irr;
- u32 mask = 1 << irq;
- union kvm_ioapic_redirect_entry entry;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
spin_lock(&ioapic->lock);
- old_irr = ioapic->irr;
irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
irq_source_id, level);
- entry = ioapic->redirtbl[irq];
- irq_level ^= entry.fields.polarity;
- if (!irq_level) {
- ioapic->irr &= ~mask;
- ret = 1;
- } else {
- int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
- if (irq == RTC_GSI && line_status &&
- rtc_irq_check_coalesced(ioapic)) {
- ret = 0; /* coalesced */
- goto out;
- }
- ioapic->irr |= mask;
- if ((edge && old_irr != ioapic->irr) ||
- (!edge && !entry.fields.remote_irr))
- ret = ioapic_service(ioapic, irq, line_status);
- else
- ret = 0; /* report coalesced interrupt */
- }
-out:
- trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
spin_unlock(&ioapic->lock);
return ret;
@@ -394,7 +423,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent->fields.remote_irr = 0;
- if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+ if (ioapic->irr & (1 << i))
ioapic_service(ioapic, i, false);
}
}
@@ -595,9 +624,10 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
spin_lock(&ioapic->lock);
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
- kvm_rtc_eoi_tracking_restore_all(ioapic);
+ kvm_ioapic_inject_all(ioapic, state->irr);
spin_unlock(&ioapic->lock);
return 0;
}
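
[Note: the ioapic rework folds the old ioapic_deliver() into ioapic_service() (which now also checks the mask bit and sets remote_irr itself) and concentrates the coalescing decision in the new ioapic_set_irq(): an edge interrupt is coalesced when its IRR bit is already set, a level interrupt while remote_irr shows the previous one still awaiting its EOI, and the RTC GSI is special-cased via rtc_irq_check_coalesced() because Windows guests drift when coalesced RTC ticks go unreported. Edge IRR bits are now cleared at delivery time in ioapic_service(), which is why kvm_set_ioapic() zeroes irr and replays the saved state through kvm_ioapic_inject_all(). A runnable userspace model of just the coalescing predicate — simplified, names are mine, RTC special case omitted:

#include <stdbool.h>
#include <stdio.h>

/* Model of the coalescing rules in ioapic_set_irq() above. */
static bool coalesced(unsigned int irr, unsigned int irq,
		      bool edge, bool remote_irr)
{
	unsigned int mask = 1u << irq;

	if (edge)
		return irr & mask;  /* previous edge not serviced yet */
	return remote_irr;          /* level IRQ still waiting for EOI */
}

int main(void)
{
	printf("edge, IRR already set: %d\n", coalesced(0x4, 2, true, false));
	printf("edge, IRR clear:       %d\n", coalesced(0x0, 2, true, false));
	printf("level, remote_irr set: %d\n", coalesced(0x0, 2, false, true));
	printf("level, remote_irr 0:   %d\n", coalesced(0x0, 2, false, false));
	return 0;
}
]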
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b5ec7fb986f6..56baae8c2f56 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -186,12 +186,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
- long dirty_count = kvm->tlbs_dirty;
-
- smp_mb();
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
- cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+ kvm->tlbs_dirty = false;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
@@ -1804,7 +1801,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (vcpu == me)
continue;
- if (waitqueue_active(&vcpu->wq))
+ if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
@@ -2284,6 +2281,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
ops = &kvm_arm_vgic_v2_ops;
break;
#endif
+#ifdef CONFIG_S390
+ case KVM_DEV_TYPE_FLIC:
+ ops = &kvm_flic_ops;
+ break;
+#endif
default:
return -ENODEV;
}
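
[Note: three independent kvm_main.c changes here. kvm_flush_remote_tlbs() drops the smp_mb()/cmpxchg() dance and simply clears tlbs_dirty after requesting the flush; kvm_vcpu_on_spin() no longer skips a waiting vcpu when it is actually runnable (a halted vcpu with a pending event is a good yield target); and KVM_CREATE_DEVICE learns the s390 FLIC type. For the last one, a sketch of what a kvm_device_ops provider such as kvm_flic_ops supplies — the names below are hypothetical, only the struct and its fields come from kvm_host.h:

#include <linux/kvm_host.h>
#include <linux/slab.h>

/* Sketch: the ops bundle kvm_ioctl_create_device() dispatches to. */
static int example_dev_create(struct kvm_device *dev, u32 type)
{
	/* allocate device state and hang it off dev->private */
	return 0;
}

static void example_dev_destroy(struct kvm_device *dev)
{
	/* dev was allocated by kvm_ioctl_create_device(); .destroy
	 * is expected to free it */
	kfree(dev);
}

static struct kvm_device_ops example_dev_ops = {
	.name    = "example",
	.create  = example_dev_create,
	.destroy = example_dev_destroy,
};
]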
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index b4f9507ae650..ba1a93f935c7 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -59,6 +59,22 @@ static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
symbol_put(vfio_group_put_external_user);
}
+static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
+{
+ long (*fn)(struct vfio_group *, unsigned long);
+ long ret;
+
+ fn = symbol_get(vfio_external_check_extension);
+ if (!fn)
+ return false;
+
+ ret = fn(vfio_group, VFIO_DMA_CC_IOMMU);
+
+ symbol_put(vfio_external_check_extension);
+
+ return ret > 0;
+}
+
/*
* Groups can use the same or different IOMMU domains. If the same then
* adding a new group may change the coherency of groups we've previously
@@ -75,13 +91,10 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev)
mutex_lock(&kv->lock);
list_for_each_entry(kvg, &kv->group_list, node) {
- /*
- * TODO: We need an interface to check the coherency of
- * the IOMMU domain this group is using. For now, assume
- * it's always noncoherent.
- */
- noncoherent = true;
- break;
+ if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) {
+ noncoherent = true;
+ break;
+ }
}
if (noncoherent != kv->noncoherent) {
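
[Note: the vfio change replaces the "assume noncoherent" TODO with a real query: kvm_vfio_group_is_coherent() asks the vfio module whether the group's IOMMU domain enforces DMA cache coherency (VFIO_DMA_CC_IOMMU). It uses symbol_get()/symbol_put() so kvm only calls into vfio when that module is actually loaded, holding a module reference for the duration of the call so vfio cannot unload mid-call. The pattern in isolation, as a sketch against a hypothetical provider symbol:

#include <linux/module.h>

/* Hypothetical exported symbol living in some other module: */
extern long provider_query(unsigned long arg);

static long query_if_loaded(unsigned long arg)
{
	long (*fn)(unsigned long);
	long ret = -ENOENT;

	fn = symbol_get(provider_query);	/* takes a module ref */
	if (!fn)
		return ret;			/* provider not loaded */

	ret = fn(arg);				/* module cannot unload here */
	symbol_put(provider_query);		/* drop the ref */
	return ret;
}
]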