2 files changed, 164 insertions, 92 deletions
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index eac1b3d83a86..ec685c1f3b78 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -46,10 +46,9 @@ static const struct kvm_irq_level default_vtimer_irq = {
 	.level	= 1,
 };
 
-void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	vcpu_vtimer(vcpu)->active_cleared_last = false;
-}
+static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
+				 struct arch_timer_context *timer_ctx);
 
 u64 kvm_phys_timer_read(void)
 {
@@ -69,17 +68,45 @@ static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work)
 		cancel_work_sync(work);
 }
 
-static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
+static void kvm_vtimer_update_mask_user(struct kvm_vcpu *vcpu)
 {
-	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 
 	/*
-	 * We disable the timer in the world switch and let it be
-	 * handled by kvm_timer_sync_hwstate(). Getting a timer
-	 * interrupt at this point is a sure sign of some major
-	 * breakage.
+	 * When using a userspace irqchip with the architected timers, we must
+	 * prevent continuously exiting from the guest, and therefore mask the
+	 * physical interrupt by disabling it on the host interrupt controller
+	 * when the virtual level is high, such that the guest can make
+	 * forward progress.  Once we detect the output level being
+	 * de-asserted, we unmask the interrupt again so that we exit from the
+	 * guest when the timer fires.
 	 */
-	pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu);
+	if (vtimer->irq.level)
+		disable_percpu_irq(host_vtimer_irq);
+	else
+		enable_percpu_irq(host_vtimer_irq, 0);
+}
+
+static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
+{
+	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
+	struct arch_timer_context *vtimer;
+
+	if (!vcpu) {
+		pr_warn_once("Spurious arch timer IRQ on non-VCPU thread\n");
+		return IRQ_NONE;
+	}
+	vtimer = vcpu_vtimer(vcpu);
+
+	if (!vtimer->irq.level) {
+		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
+		if (kvm_timer_irq_can_fire(vtimer))
+			kvm_timer_update_irq(vcpu, true, vtimer);
+	}
+
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		kvm_vtimer_update_mask_user(vcpu);
+
 	return IRQ_HANDLED;
 }
 
@@ -215,7 +242,6 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 {
 	int ret;
 
-	timer_ctx->active_cleared_last = false;
 	timer_ctx->irq.level = new_level;
 	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
 				   timer_ctx->irq.level);
@@ -271,10 +297,16 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu,
 	soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(timer_ctx));
 }
 
-static void timer_save_state(struct kvm_vcpu *vcpu)
+static void vtimer_save_state(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	if (!vtimer->loaded)
+		goto out;
 
 	if (timer->enabled) {
 		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
@@ -283,6 +315,10 @@ static void timer_save_state(struct kvm_vcpu *vcpu)
 
 	/* Disable the virtual timer */
 	write_sysreg_el0(0, cntv_ctl);
+
+	vtimer->loaded = false;
+out:
+	local_irq_restore(flags);
 }
 
 /*
@@ -296,6 +332,8 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu)
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
+	vtimer_save_state(vcpu);
+
 	/*
 	 * No need to schedule a background timer if any guest timer has
 	 * already expired, because kvm_vcpu_block will return before putting
@@ -318,22 +356,34 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu)
 	soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
 }
 
-static void timer_restore_state(struct kvm_vcpu *vcpu)
+static void vtimer_restore_state(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	if (vtimer->loaded)
+		goto out;
 
 	if (timer->enabled) {
 		write_sysreg_el0(vtimer->cnt_cval, cntv_cval);
 		isb();
 		write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl);
 	}
+
+	vtimer->loaded = true;
+out:
+	local_irq_restore(flags);
 }
 
 void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
+	vtimer_restore_state(vcpu);
+
 	soft_timer_cancel(&timer->bg_timer, &timer->expired);
 }
 
@@ -352,61 +402,45 @@ static void set_cntvoff(u64 cntvoff)
 	kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
 }
 
-static void kvm_timer_flush_hwstate_vgic(struct kvm_vcpu *vcpu)
+static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	bool phys_active;
 	int ret;
 
-	/*
-	* If we enter the guest with the virtual input level to the VGIC
-	* asserted, then we have already told the VGIC what we need to, and
-	* we don't need to exit from the guest until the guest deactivates
-	* the already injected interrupt, so therefore we should set the
-	* hardware active state to prevent unnecessary exits from the guest.
-	*
-	* Also, if we enter the guest with the virtual timer interrupt active,
-	* then it must be active on the physical distributor, because we set
-	* the HW bit and the guest must be able to deactivate the virtual and
-	* physical interrupt at the same time.
-	*
-	* Conversely, if the virtual input level is deasserted and the virtual
-	* interrupt is not active, then always clear the hardware active state
-	* to ensure that hardware interrupts from the timer triggers a guest
-	* exit.
-	*/
 	phys_active = vtimer->irq.level ||
-			kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
-
-	/*
-	 * We want to avoid hitting the (re)distributor as much as
-	 * possible, as this is a potentially expensive MMIO access
-	 * (not to mention locks in the irq layer), and a solution for
-	 * this is to cache the "active" state in memory.
-	 *
-	 * Things to consider: we cannot cache an "active set" state,
-	 * because the HW can change this behind our back (it becomes
-	 * "clear" in the HW). We must then restrict the caching to
-	 * the "clear" state.
-	 *
-	 * The cache is invalidated on:
-	 * - vcpu put, indicating that the HW cannot be trusted to be
-	 *   in a sane state on the next vcpu load,
-	 * - any change in the interrupt state
-	 *
-	 * Usage conditions:
-	 * - cached value is "active clear"
-	 * - value to be programmed is "active clear"
-	 */
-	if (vtimer->active_cleared_last && !phys_active)
-		return;
+		      kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
 
 	ret = irq_set_irqchip_state(host_vtimer_irq,
 				    IRQCHIP_STATE_ACTIVE,
 				    phys_active);
 	WARN_ON(ret);
+}
 
-	vtimer->active_cleared_last = !phys_active;
+static void kvm_timer_vcpu_load_user(struct kvm_vcpu *vcpu)
+{
+	kvm_vtimer_update_mask_user(vcpu);
+}
+
+void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+	if (unlikely(!timer->enabled))
+		return;
+
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		kvm_timer_vcpu_load_user(vcpu);
+	else
+		kvm_timer_vcpu_load_vgic(vcpu);
+
+	set_cntvoff(vtimer->cntvoff);
+
+	vtimer_restore_state(vcpu);
+
+	if (has_vhe())
+		disable_el1_phys_timer_access();
 }
 
 bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
@@ -426,23 +460,6 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
 	       ptimer->irq.level != plevel;
 }
 
-static void kvm_timer_flush_hwstate_user(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	/*
-	 * To prevent continuously exiting from the guest, we mask the
-	 * physical interrupt such that the guest can make forward progress.
-	 * Once we detect the output level being deasserted, we unmask the
-	 * interrupt again so that we exit from the guest when the timer
-	 * fires.
-	*/
-	if (vtimer->irq.level)
-		disable_percpu_irq(host_vtimer_irq);
-	else
-		enable_percpu_irq(host_vtimer_irq, 0);
-}
-
 /**
  * kvm_timer_flush_hwstate - prepare timers before running the vcpu
  * @vcpu: The vcpu pointer
@@ -455,23 +472,61 @@ static void kvm_timer_flush_hwstate_user(struct kvm_vcpu *vcpu)
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
 	if (unlikely(!timer->enabled))
 		return;
 
-	kvm_timer_update_state(vcpu);
+	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
+		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
 
 	/* Set the background timer for the physical timer emulation. */
 	phys_timer_emulate(vcpu, vcpu_ptimer(vcpu));
+}
 
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		kvm_timer_flush_hwstate_user(vcpu);
-	else
-		kvm_timer_flush_hwstate_vgic(vcpu);
+void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
-	set_cntvoff(vtimer->cntvoff);
-	timer_restore_state(vcpu);
+	if (unlikely(!timer->enabled))
+		return;
+
+	if (has_vhe())
+		enable_el1_phys_timer_access();
+
+	vtimer_save_state(vcpu);
+
+	/*
+	 * The kernel may decide to run userspace after calling vcpu_put, so
+	 * we reset cntvoff to 0 to ensure a consistent read between user
+	 * accesses to the virtual counter and kernel access to the physical
+	 * counter.
+	 */
+	set_cntvoff(0);
+}
+
+static void unmask_vtimer_irq(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
+		kvm_vtimer_update_mask_user(vcpu);
+		return;
+	}
+
+	/*
+	 * If the guest disabled the timer without acking the interrupt, then
+	 * we must make sure the physical and virtual active states are in
+	 * sync by deactivating the physical interrupt, because otherwise we
+	 * wouldn't see the next timer interrupt in the host.
+	 */
+	if (!kvm_vgic_map_is_active(vcpu, vtimer->irq.irq)) {
+		int ret;
+		ret = irq_set_irqchip_state(host_vtimer_irq,
+					    IRQCHIP_STATE_ACTIVE,
+					    false);
+		WARN_ON(ret);
+	}
 }
 
 /**
@@ -484,6 +539,7 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 
 	/*
 	 * This is to cancel the background timer for the physical timer
@@ -491,14 +547,19 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 	 */
 	soft_timer_cancel(&timer->phys_timer, NULL);
 
-	timer_save_state(vcpu);
-	set_cntvoff(0);
-
 	/*
-	 * The guest could have modified the timer registers or the timer
-	 * could have expired, update the timer state.
+	 * If we entered the guest with the vtimer output asserted we have to
+	 * check if the guest has modified the timer so that we should lower
+	 * the line at this point.
 	 */
-	kvm_timer_update_state(vcpu);
+	if (vtimer->irq.level) {
+		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
+		vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
+		if (!kvm_timer_should_fire(vtimer)) {
+			kvm_timer_update_irq(vcpu, false, vtimer);
+			unmask_vtimer_irq(vcpu);
+		}
+	}
 }
 
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 27db222a0c8d..132d39ae13d2 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -354,18 +354,18 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
 
 	kvm_arm_set_running_vcpu(vcpu);
-
 	kvm_vgic_load(vcpu);
+	kvm_timer_vcpu_load(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	kvm_timer_vcpu_put(vcpu);
 	kvm_vgic_put(vcpu);
 
 	vcpu->cpu = -1;
 
 	kvm_arm_set_running_vcpu(NULL);
-	kvm_timer_vcpu_put(vcpu);
 }
 
 static void vcpu_power_off(struct kvm_vcpu *vcpu)
@@ -710,16 +710,27 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		kvm_arm_clear_debug(vcpu);
 
 		/*
-		 * We must sync the PMU and timer state before the vgic state so
+		 * We must sync the PMU state before the vgic state so
 		 * that the vgic can properly sample the updated state of the
 		 * interrupt line.
 		 */
 		kvm_pmu_sync_hwstate(vcpu);
-		kvm_timer_sync_hwstate(vcpu);
 
+		/*
+		 * Sync the vgic state before syncing the timer state because
+		 * the timer code needs to know if the virtual timer
+		 * interrupts are active.
+		 */
 		kvm_vgic_sync_hwstate(vcpu);
 
 		/*
+		 * Sync the timer hardware state before enabling interrupts as
+		 * we don't want vtimer interrupts to race with syncing the
+		 * timer virtual interrupt state.
+		 */
+		kvm_timer_sync_hwstate(vcpu);
+
+		/*
 		 * We may have taken a host interrupt in HYP mode (ie
 		 * while executing the guest). This interrupt is still
 		 * pending, as we haven't serviced it yet!