Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--   arch/x86/kvm/xen.c   370
-rw-r--r--   arch/x86/kvm/xen.h     6
2 files changed, 269 insertions, 107 deletions
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b8e9628fbf5..cfc1c07bc78f 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -170,112 +170,44 @@ static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
vcpu->arch.xen.timer.function = xen_timer_callback;
}
-static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
- u64 now = get_kvmclock_ns(v->kvm);
- u64 delta_ns = now - vx->runstate_entry_time;
- u64 run_delay = current->sched_info.run_delay;
-
- if (unlikely(!vx->runstate_entry_time))
- vx->current_runstate = RUNSTATE_offline;
-
- /*
- * Time waiting for the scheduler isn't "stolen" if the
- * vCPU wasn't running anyway.
- */
- if (vx->current_runstate == RUNSTATE_running) {
- u64 steal_ns = run_delay - vx->last_steal;
-
- delta_ns -= steal_ns;
-
- vx->runstate_times[RUNSTATE_runnable] += steal_ns;
- }
- vx->last_steal = run_delay;
-
- vx->runstate_times[vx->current_runstate] += delta_ns;
- vx->current_runstate = state;
- vx->runstate_entry_time = now;
-}
-
-void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
-{
- struct kvm_vcpu_xen *vx = &v->arch.xen;
- struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
- uint64_t *user_times;
+ struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+ struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
+ size_t user_len, user_len1, user_len2;
+ struct vcpu_runstate_info rs;
unsigned long flags;
- size_t user_len;
- int *user_state;
-
- kvm_xen_update_runstate(v, state);
-
- if (!vx->runstate_cache.active)
- return;
-
- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
- user_len = sizeof(struct vcpu_runstate_info);
- else
- user_len = sizeof(struct compat_vcpu_runstate_info);
-
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- user_len)) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- /* When invoked from kvm_sched_out() we cannot sleep */
- if (state == RUNSTATE_runnable)
- return;
-
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ size_t times_ofs;
+ uint8_t *update_bit;
+ uint64_t *rs_times;
+ int *rs_state;
/*
* The only difference between 32-bit and 64-bit versions of the
- * runstate struct us the alignment of uint64_t in 32-bit, which
+ * runstate struct is the alignment of uint64_t in 32-bit, which
* means that the 64-bit version has an additional 4 bytes of
- * padding after the first field 'state'.
- *
- * So we use 'int __user *user_state' to point to the state field,
- * and 'uint64_t __user *user_times' for runstate_entry_time. So
- * the actual array of time[] in each state starts at user_times[1].
+ * padding after the first field 'state'. Let's be really really
+ * paranoid about that, and matching it with our internal data
+ * structures that we memcpy into it...
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
#ifdef CONFIG_X86_64
+ /*
+ * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
+ * so each subsequent field is shifted by 4, and it's 4 bytes longer.
+ */
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
+ BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
#endif
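
(Aside: the padding difference those BUILD_BUG_ON()s guard can be reproduced in a small standalone program. The struct definitions below are simplified stand-ins for the Xen ABI headers, assumed for illustration only, not the kernel's own declarations.)

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* 64-bit guest layout: uint64_t is 8-byte aligned, so the compiler
 * inserts 4 bytes of padding after 'state'. */
struct vcpu_runstate_info {
	int      state;             /* offset 0x00 */
	uint64_t state_entry_time;  /* offset 0x08 */
	uint64_t time[4];           /* offset 0x10, sizeof == 0x30 */
};

/* 32-bit guest layout: uint64_t is only 4-byte aligned, so there is no
 * padding and every later field sits 4 bytes earlier. */
struct compat_vcpu_runstate_info {
	int      state;             /* offset 0x00 */
	uint64_t state_entry_time;  /* offset 0x04 */
	uint64_t time[4];           /* offset 0x0c, sizeof == 0x2c */
} __attribute__((packed, aligned(4)));

int main(void)
{
	static_assert(sizeof(struct compat_vcpu_runstate_info) == 0x2c, "");
	static_assert(sizeof(struct vcpu_runstate_info) == 0x2c + 4, "");
	static_assert(offsetof(struct vcpu_runstate_info, time) ==
		      offsetof(struct compat_vcpu_runstate_info, time) + 4, "");
	return 0;
}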
-
- user_state = gpc->khva;
-
- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
- user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
- state_entry_time);
- else
- user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
- state_entry_time);
-
/*
- * First write the updated state_entry_time at the appropriate
- * location determined by 'offset'.
- */
- BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
- sizeof(user_times[0]));
- BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
- sizeof(user_times[0]));
-
- user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
- smp_wmb();
-
- /*
- * Next, write the new runstate. This is in the *same* place
- * for 32-bit and 64-bit guests, asserted here for paranoia.
+ * The state field is in the same place at the start of both structs,
+ * and is the same size (int) as vx->current_runstate.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
offsetof(struct compat_vcpu_runstate_info, state));
@@ -284,34 +216,228 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- *user_state = vx->current_runstate;
+ /*
+ * The state_entry_time field is 64 bits in both versions, and the
+ * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
+ * is little-endian means that it's in the last *byte* of the word.
+ * That detail is important later.
+ */
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
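
(Aside: a hedged standalone sketch of that "last byte" detail. XEN_RUNSTATE_UPDATE is defined locally here as bit 63, consistent with the 0x80 check above; on little-endian x86 that bit lives in the final byte of the word, so a single byte store can set or clear just the flag.)

#include <assert.h>
#include <stdint.h>

#define XEN_RUNSTATE_UPDATE (1ULL << 63)  /* top bit of state_entry_time */

int main(void)
{
	uint64_t entry_time = 0x1122334455667788ULL;  /* flag never set here */
	uint64_t guest_word = entry_time;             /* what the guest sees */
	/* Most significant byte of a little-endian u64 is the *last* byte: */
	uint8_t *update_bit = (uint8_t *)&guest_word + sizeof(uint64_t) - 1;

	*update_bit = (entry_time | XEN_RUNSTATE_UPDATE) >> 56;  /* 0x91: flag set */
	assert(guest_word == (entry_time | XEN_RUNSTATE_UPDATE));

	*update_bit = entry_time >> 56;                          /* 0x11: flag clear */
	assert(guest_word == entry_time);
	return 0;
}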
/*
- * Write the actual runstate times immediately after the
- * runstate_entry_time.
+ * The time array is four 64-bit quantities in both versions, matching
+ * the vx->runstate_times and immediately following state_entry_time.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
- offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
+ offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
- offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
+ offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof_field(struct compat_vcpu_runstate_info, time));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ user_len = sizeof(struct vcpu_runstate_info);
+ times_ofs = offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ } else {
+ user_len = sizeof(struct compat_vcpu_runstate_info);
+ times_ofs = offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+ }
+
+ /*
+ * There are basically no alignment constraints. The guest can set it
+ * up so it crosses from one page to the next, and at arbitrary byte
+ * alignment (and the 32-bit ABI doesn't align the 64-bit integers
+ * anyway, even if the overall struct had been 64-bit aligned).
+ */
+ if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+ user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+ user_len2 = user_len - user_len1;
+ } else {
+ user_len1 = user_len;
+ user_len2 = 0;
+ }
+ BUG_ON(user_len1 + user_len2 != user_len);
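
(Aside: a hedged worked example of that split, using a hypothetical guest address and assuming PAGE_SIZE is 4KiB with user_len == 0x30 as for a 64-bit guest.)

#include <stdio.h>

#define PAGE_SIZE 0x1000UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long gpa = 0x12345ff8UL;  /* hypothetical: page offset 0xff8 */
	unsigned long user_len = 0x30, user_len1, user_len2;

	if ((gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
		user_len1 = PAGE_SIZE - (gpa & ~PAGE_MASK);  /* 0x08 on page one */
		user_len2 = user_len - user_len1;            /* 0x28 on page two */
	} else {
		user_len1 = user_len;
		user_len2 = 0;
	}
	printf("user_len1=%#lx user_len2=%#lx\n", user_len1, user_len2);
	return 0;
}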
+
+ retry:
+ /*
+ * Attempt to obtain the GPC lock on *both* (if there are two)
+ * gfn_to_pfn caches that cover the region.
+ */
+ read_lock_irqsave(&gpc1->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa, user_len1)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1))
+ return;
+
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+
+ if (likely(!user_len2)) {
+ /*
+ * Set up three pointers directly to the runstate_info
+ * struct in the guest (via the GPC).
+ *
+ * • @rs_state → state field
+ * • @rs_times → state_entry_time field.
+ * • @update_bit → last byte of state_entry_time, which
+ * contains the XEN_RUNSTATE_UPDATE bit.
+ */
+ rs_state = gpc1->khva;
+ rs_times = gpc1->khva + times_ofs;
+ update_bit = ((void *)(&rs_times[1])) - 1;
+ } else {
+ /*
+ * The guest's runstate_info is split across two pages and we
+ * need to hold and validate both GPCs simultaneously. We can
+ * declare a lock ordering GPC1 > GPC2 because nothing else
+ * takes them more than one at a time.
+ */
+ read_lock(&gpc2->lock);
+
+ if (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc2, gpc2->gpa, user_len2)) {
+ read_unlock(&gpc2->lock);
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ /*
+ * Use kvm_gpc_activate() here because if the runstate
+ * area was configured in 32-bit mode and only extends
+ * to the second page now because the guest changed to
+ * 64-bit mode, the second GPC won't have been set up.
+ */
+ if (kvm_gpc_activate(v->kvm, gpc2, NULL, KVM_HOST_USES_PFN,
+ gpc1->gpa + user_len1, user_len2))
+ return;
+
+ /*
+ * We dropped the lock on GPC1 so we have to go all the
+ * way back and revalidate that too.
+ */
+ goto retry;
+ }
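
(Aside: the lock-ordering and "drop everything and revalidate from the top" pattern described above, shown as a hedged userspace sketch with POSIX rwlocks and a hypothetical cache/refresh abstraction, not the KVM gfn_to_pfn cache API.)

#include <pthread.h>
#include <stdbool.h>

struct cache { pthread_rwlock_t lock; bool valid; };

/* Lock order c1 -> c2 is safe because nothing takes them the other way
 * round; once any lock is dropped, validation restarts from the top. */
static bool lock_both_valid(struct cache *c1, struct cache *c2,
			    bool (*refresh)(struct cache *))
{
retry:
	pthread_rwlock_rdlock(&c1->lock);
	if (!c1->valid) {
		pthread_rwlock_unlock(&c1->lock);
		if (!refresh(c1))
			return false;
		goto retry;
	}
	pthread_rwlock_rdlock(&c2->lock);
	if (!c2->valid) {
		pthread_rwlock_unlock(&c2->lock);
		pthread_rwlock_unlock(&c1->lock);
		if (!refresh(c2))
			return false;
		goto retry;   /* c1 must be revalidated too */
	}
	return true;          /* caller now holds both read locks */
}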
+
+ /*
+ * In this case, the runstate_info struct will be assembled on
+ * the kernel stack (compat or not as appropriate) and will
+ * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
+ * rs pointers accordingly.
+ */
+ rs_times = &rs.state_entry_time;
+
+ /*
+ * The rs_state pointer points to the start of what we'll
+ * copy to the guest, which in the case of a compat guest
+ * is the 32-bit field that the compiler thinks is padding.
+ */
+ rs_state = ((void *)rs_times) - times_ofs;
+
+ /*
+ * The update_bit is still directly in the guest memory,
+ * via one GPC or the other.
+ */
+ if (user_len1 >= times_ofs + sizeof(uint64_t))
+ update_bit = gpc1->khva + times_ofs +
+ sizeof(uint64_t) - 1;
+ else
+ update_bit = gpc2->khva + times_ofs +
+ sizeof(uint64_t) - 1 - user_len1;
+
+#ifdef CONFIG_X86_64
+ /*
+ * Don't leak kernel memory through the padding in the 64-bit
+ * version of the struct.
+ */
+ memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
+#endif
+ }
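
(Aside: the rs_state-into-the-padding trick set up above can be demonstrated in isolation. This is a hedged sketch reusing the simplified stand-in structs from the earlier sketch, and it glosses over strict-aliasing niceties; it is not the kernel code.)

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct vcpu_runstate_info { int state; uint64_t state_entry_time; uint64_t time[4]; };
struct compat_vcpu_runstate_info {
	int state; uint64_t state_entry_time; uint64_t time[4];
} __attribute__((packed, aligned(4)));

int main(void)
{
	struct vcpu_runstate_info rs;
	struct compat_vcpu_runstate_info out;
	size_t times_ofs = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
	uint64_t *rs_times = &rs.state_entry_time;
	/* For a compat guest this lands in the 64-bit struct's padding hole: */
	int *rs_state = (int *)((uint8_t *)rs_times - times_ofs);
	uint64_t times[4] = { 111, 222, 333, 444 };

	memset(&rs, 0, sizeof(rs));
	*rs_state = 1;                        /* e.g. RUNSTATE_runnable */
	rs_times[0] = 0x1234567890ULL;
	memcpy(rs_times + 1, times, sizeof(times));

	/* The bytes starting at rs_state are exactly the compat layout. */
	memcpy(&out, rs_state, sizeof(out));
	assert(out.state == 1);
	assert(out.state_entry_time == 0x1234567890ULL);
	assert(out.time[3] == 444);
	return 0;
}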
+
+ /*
+ * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
+ * state_entry_time field, directly in the guest. We need to set
+ * that (and write-barrier) before writing to the rest of the
+ * structure, and clear it last. Just as Xen does, we address the
+ * single *byte* in which it resides because it might be in a
+ * different cache line to the rest of the 64-bit word, due to
+ * the (lack of) alignment constraints.
+ */
+ *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
smp_wmb();
/*
- * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
- * runstate_entry_time field.
+ * Now assemble the actual structure, either on our kernel stack
+ * or directly in the guest according to how the rs_state and
+ * rs_times pointers were set up above.
*/
- user_times[0] &= ~XEN_RUNSTATE_UPDATE;
+ *rs_state = vx->current_runstate;
+ rs_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
+ memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
+
+ /* For the split case, we have to then copy it to the guest. */
+ if (user_len2) {
+ memcpy(gpc1->khva, rs_state, user_len1);
+ memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
+ }
smp_wmb();
- read_unlock_irqrestore(&gpc->lock, flags);
+ /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
+ *update_bit = vx->runstate_entry_time >> 56;
+ smp_wmb();
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ if (user_len2)
+ read_unlock(&gpc2->lock);
+
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
+ if (user_len2)
+ mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
+}
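
(Aside: for context, the reader side of this update-bit protocol, roughly as a Xen guest would implement it, along the lines of the snapshot loop in Linux's Xen time code. A hedged, simplified sketch using the stand-in struct definition again; not taken from this patch.)

#include <stdint.h>
#include <string.h>

#define XEN_RUNSTATE_UPDATE (1ULL << 63)

struct vcpu_runstate_info { int state; uint64_t state_entry_time; uint64_t time[4]; };

/* Retry until a consistent snapshot is seen: the writer sets the UPDATE
 * bit before touching the struct and clears it afterwards, so a torn
 * read shows up either as the bit being set or as state_entry_time
 * changing across the copy. */
static void runstate_snapshot(const volatile struct vcpu_runstate_info *area,
			      struct vcpu_runstate_info *snap)
{
	uint64_t entry_time;

	do {
		entry_time = area->state_entry_time;
		__sync_synchronize();                 /* read barrier */
		snap->state = area->state;
		memcpy(snap->time, (const void *)area->time, sizeof(snap->time));
		__sync_synchronize();                 /* read barrier */
	} while (area->state_entry_time != entry_time ||
		 (entry_time & XEN_RUNSTATE_UPDATE));

	snap->state_entry_time = entry_time;
}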
+
+void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ u64 now = get_kvmclock_ns(v->kvm);
+ u64 delta_ns = now - vx->runstate_entry_time;
+ u64 run_delay = current->sched_info.run_delay;
+
+ if (unlikely(!vx->runstate_entry_time))
+ vx->current_runstate = RUNSTATE_offline;
+
+ /*
+ * Time waiting for the scheduler isn't "stolen" if the
+ * vCPU wasn't running anyway.
+ */
+ if (vx->current_runstate == RUNSTATE_running) {
+ u64 steal_ns = run_delay - vx->last_steal;
+
+ delta_ns -= steal_ns;
+
+ vx->runstate_times[RUNSTATE_runnable] += steal_ns;
+ }
+ vx->last_steal = run_delay;
+
+ vx->runstate_times[vx->current_runstate] += delta_ns;
+ vx->current_runstate = state;
+ vx->runstate_entry_time = now;
+
+ if (vx->runstate_cache.active)
+ kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
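
(Aside: a hedged worked example of that accounting, with made-up numbers: if 10ms of kvmclock time passed since the last transition and the scheduler's run_delay grew by 2ms in that window, the guest is credited 2ms of "runnable" (stolen) time and 8ms of "running" time.)

#include <stdint.h>
#include <stdio.h>

/* Runstate indices as in the Xen ABI. */
enum { RUNSTATE_running, RUNSTATE_runnable, RUNSTATE_blocked, RUNSTATE_offline };

int main(void)
{
	uint64_t runstate_times[4] = { 0 };
	uint64_t delta_ns = 10 * 1000 * 1000;  /* now - runstate_entry_time */
	uint64_t steal_ns = 2 * 1000 * 1000;   /* run_delay - last_steal    */

	/* Same arithmetic as above for a vCPU that was RUNSTATE_running: */
	delta_ns -= steal_ns;
	runstate_times[RUNSTATE_runnable] += steal_ns;
	runstate_times[RUNSTATE_running]  += delta_ns;

	printf("running %llu ns, runnable %llu ns\n",
	       (unsigned long long)runstate_times[RUNSTATE_running],
	       (unsigned long long)runstate_times[RUNSTATE_runnable]);
	return 0;
}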
static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
@@ -584,23 +710,57 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
- case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+ size_t sz, sz1, sz2;
+
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
if (data->u.gpa == GPA_INVALID) {
+ r = 0;
+ deactivate_out:
kvm_gpc_deactivate(vcpu->kvm,
&vcpu->arch.xen.runstate_cache);
- r = 0;
+ kvm_gpc_deactivate(vcpu->kvm,
+ &vcpu->arch.xen.runstate2_cache);
break;
}
+ /*
+ * If the guest switches to 64-bit mode after setting the runstate
+ * address, that's actually OK. kvm_xen_update_runstate_guest()
+ * will cope.
+ */
+ if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+ sz = sizeof(struct vcpu_runstate_info);
+ else
+ sz = sizeof(struct compat_vcpu_runstate_info);
+
+ /* How much fits in the (first) page? */
+ sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_runstate_info));
- break;
+ NULL, KVM_HOST_USES_PFN, data->u.gpa, sz1);
+ if (r)
+ goto deactivate_out;
+
+ /* Either map the second page, or deactivate the second GPC */
+ if (sz1 >= sz) {
+ kvm_gpc_deactivate(vcpu->kvm,
+ &vcpu->arch.xen.runstate2_cache);
+ } else {
+ sz2 = sz - sz1;
+ BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
+ r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache,
+ NULL, KVM_HOST_USES_PFN,
+ data->u.gpa + sz1, sz2);
+ if (r)
+ goto deactivate_out;
+ }
+ kvm_xen_update_runstate_guest(vcpu, false);
+ break;
+ }
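
(Aside: a hedged illustration of that decision, with hypothetical GPAs, PAGE_SIZE assumed 4KiB and sz == 0x30 for a 64-bit guest; everything hinges on whether the first page can hold the whole struct.)

#include <stdio.h>

#define PAGE_SIZE 0x1000UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static void plan(unsigned long gpa, unsigned long sz)
{
	unsigned long sz1 = PAGE_SIZE - (gpa & ~PAGE_MASK);

	if (sz1 >= sz)
		printf("gpa %#lx: one cache of %#lx bytes, second GPC deactivated\n",
		       gpa, sz);
	else
		printf("gpa %#lx: first cache %#lx bytes, second cache %#lx bytes\n",
		       gpa, sz1, sz - sz1);
}

int main(void)
{
	plan(0x11110200UL, 0x30);  /* fits in one page                 */
	plan(0x11110fe0UL, 0x30);  /* split: 0x20 + 0x10 across a page */
	return 0;
}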
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
@@ -1834,6 +1994,7 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_init(&vcpu->arch.xen.runstate2_cache);
kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
}
@@ -1844,6 +2005,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
kvm_xen_stop_timer(vcpu);
kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache);
kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 532a535a9e99..8503d2c6891e 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -143,11 +143,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
#include <asm/xen/interface.h>
#include <xen/interface/vcpu.h>
-void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state);
+void kvm_xen_update_runstate(struct kvm_vcpu *vcpu, int state);
static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
{
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running);
+ kvm_xen_update_runstate(vcpu, RUNSTATE_running);
}
static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
@@ -162,7 +162,7 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(!vcpu->preempted))
return;
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+ kvm_xen_update_runstate(vcpu, RUNSTATE_runnable);
}
/* 32-bit compatibility definitions, also used natively in 32-bit build */