Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux

Pull virtio updates from Rusty Russell: "OK, this has the big virtio 1.0 implementation, as specified by OASIS. On top of that is the major rework of lguest, to use PCI and virtio 1.0, to double-check the implementation. Then comes the inevitable fixes and cleanups from that work" * tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (80 commits) virtio: don't set VIRTIO_CONFIG_S_DRIVER_OK twice. virtio_net: unconditionally define struct virtio_net_hdr_v1. tools/lguest: don't use legacy definitions for net device in example launcher. virtio: Don't expose legacy net features when VIRTIO_NET_NO_LEGACY defined. tools/lguest: use common error macros in the example launcher. tools/lguest: give virtqueues names for better error messages tools/lguest: more documentation and checking of virtio 1.0 compliance. lguest: don't look in console features to find emerg_wr. tools/lguest: don't start devices until DRIVER_OK status set. tools/lguest: handle indirect partway through chain. tools/lguest: insert driver references from the 1.0 spec (4.1 Virtio Over PCI) tools/lguest: insert device references from the 1.0 spec (4.1 Virtio Over PCI) tools/lguest: rename virtio_pci_cfg_cap field to match spec. tools/lguest: fix features_accepted logic in example launcher. tools/lguest: handle device reset correctly in example launcher. virtual: Documentation: simplify and generalize paravirt_ops.txt lguest: remove NOTIFY call and eventfd facility. lguest: remove NOTIFY facility from demonstration launcher. lguest: use the PCI console device's emerg_wr for early boot messages. lguest: always put console in PCI slot #1. ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-02-18 09:24:01 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-02-18 09:24:01 -0800
commit: 53861af9a17022898619a2ae4ead0dfc601b7c13 (patch)
tree: dc11088d9e86fa1d8d8479974864153a8f976897 /drivers/lguest
parent: 5c2770079fb9b8c5bfb7113d9e76de66e77a0e24 (diff)
parent: 5b40a7daf51812b35cf05d1601a779a7043f8414 (diff)
download: linux-53861af9a17022898619a2ae4ead0dfc601b7c13.tar.gz
linux-53861af9a17022898619a2ae4ead0dfc601b7c13.tar.bz2
linux-53861af9a17022898619a2ae4ead0dfc601b7c13.zip
8 files changed, 252 insertions, 847 deletions
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index c4197503900e..16f52ee73994 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,6 +1,3 @@
-# Guest requires the device configuration and probing code.
-obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o
-
 # Host requires the other files, which can be a module.
 obj-$(CONFIG_LGUEST)	+= lg.o
 lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 6590558d1d31..7dc93aa004c8 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -208,6 +208,14 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
  */
 int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
+	/* If the launcher asked for a register with LHREQ_GETREG */
+	if (cpu->reg_read) {
+		if (put_user(*cpu->reg_read, user))
+			return -EFAULT;
+		cpu->reg_read = NULL;
+		return sizeof(*cpu->reg_read);
+	}
+
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
 		unsigned int irq;
@@ -217,21 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		if (cpu->hcall)
 			do_hypercalls(cpu);
 
-		/*
-		 * It's possible the Guest did a NOTIFY hypercall to the
-		 * Launcher.
-		 */
-		if (cpu->pending_notify) {
-			/*
-			 * Does it just needs to write to a registered
-			 * eventfd (ie. the appropriate virtqueue thread)?
-			 */
-			if (!send_notify_to_eventfd(cpu)) {
-				/* OK, we tell the main Launcher. */
-				if (put_user(cpu->pending_notify, user))
-					return -EFAULT;
-				return sizeof(cpu->pending_notify);
-			}
+		/* Do we have to tell the Launcher about a trap? */
+		if (cpu->pending.trap) {
+			if (copy_to_user(user, &cpu->pending,
+					 sizeof(cpu->pending)))
+				return -EFAULT;
+			return sizeof(cpu->pending);
 		}
 
 		/*
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 83511eb0923d..1219af493c0f 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -117,9 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		/* Similarly, this sets the halted flag for run_guest(). */
 		cpu->halted = 1;
 		break;
-	case LHCALL_NOTIFY:
-		cpu->pending_notify = args->arg1;
-		break;
 	default:
 		/* It should be an architecture-specific hypercall. */
 		if (lguest_arch_do_hcall(cpu, args))
@@ -189,7 +186,7 @@ static void do_async_hcalls(struct lg_cpu *cpu)
 		 * Stop doing hypercalls if they want to notify the Launcher:
 		 * it needs to service this first.
 		 */
-		if (cpu->pending_notify)
+		if (cpu->pending.trap)
 			break;
 	}
 }
@@ -280,7 +277,7 @@ void do_hypercalls(struct lg_cpu *cpu)
 	 * NOTIFY to the Launcher, we want to return now.  Otherwise we do
 	 * the hypercall.
 	 */
-	if (!cpu->pending_notify) {
+	if (!cpu->pending.trap) {
 		do_hcall(cpu, cpu->hcall);
 		/*
 		 * Tricky point: we reset the hcall pointer to mark the
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 2eef40be4c04..307e8b39e7d1 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -50,7 +50,10 @@ struct lg_cpu {
 	/* Bitmap of what has changed: see CHANGED_* above. */
 	int changed;
 
-	unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
+	/* Pending operation. */
+	struct lguest_pending pending;
+
+	unsigned long *reg_read; /* register from LHREQ_GETREG */
 
 	/* At end of a page shared mapped over lguest_pages in guest. */
 	unsigned long regs_page;
@@ -78,24 +81,18 @@ struct lg_cpu {
 	struct lg_cpu_arch arch;
 };
 
-struct lg_eventfd {
-	unsigned long addr;
-	struct eventfd_ctx *event;
-};
-
-struct lg_eventfd_map {
-	unsigned int num;
-	struct lg_eventfd map[];
-};
-
 /* The private info the thread maintains about the guest. */
 struct lguest {
 	struct lguest_data __user *lguest_data;
 	struct lg_cpu cpus[NR_CPUS];
 	unsigned int nr_cpus;
 
+	/* Valid guest memory pages must be < this. */
 	u32 pfn_limit;
 
+	/* Device memory is >= pfn_limit and < device_limit. */
+	u32 device_limit;
+
 	/*
 	 * This provides the offset to the base of guest-physical memory in the
 	 * Launcher.
@@ -110,8 +107,6 @@ struct lguest {
 	unsigned int stack_pages;
 	u32 tsc_khz;
 
-	struct lg_eventfd_map *eventfds;
-
 	/* Dead? */
 	const char *dead;
 };
@@ -197,8 +192,10 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
 		   unsigned long vaddr, pte_t val);
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
+bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
+		 unsigned long *iomem);
 void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
+bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
 unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
 void page_table_guest_data_init(struct lg_cpu *cpu);
 
@@ -210,6 +207,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu);
 int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
 int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
 void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
+unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
 
 /* <arch>/switcher.S: */
 extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
deleted file mode 100644
index 89088d6538fd..000000000000
--- a/drivers/lguest/lguest_device.c
+++ /dev/null
@@ -1,540 +0,0 @@
-/*P:050
- * Lguest guests use a very simple method to describe devices.  It's a
- * series of device descriptors contained just above the top of normal Guest
- * memory.
- *
- * We use the standard "virtio" device infrastructure, which provides us with a
- * console, a network and a block driver.  Each one expects some configuration
- * information and a "virtqueue" or two to send and receive data.
-:*/
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio.h>
-#include <linux/virtio_config.h>
-#include <linux/interrupt.h>
-#include <linux/virtio_ring.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <asm/paravirt.h>
-#include <asm/lguest_hcall.h>
-
-/* The pointer to our (page) of device descriptions. */
-static void *lguest_devices;
-
-/*
- * For Guests, device memory can be used as normal memory, so we cast away the
- * __iomem to quieten sparse.
- */
-static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
-{
-	return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages);
-}
-
-static inline void lguest_unmap(void *addr)
-{
-	iounmap((__force void __iomem *)addr);
-}
-
-/*D:100
- * Each lguest device is just a virtio device plus a pointer to its entry
- * in the lguest_devices page.
- */
-struct lguest_device {
-	struct virtio_device vdev;
-
-	/* The entry in the lguest_devices page for this device. */
-	struct lguest_device_desc *desc;
-};
-
-/*
- * Since the virtio infrastructure hands us a pointer to the virtio_device all
- * the time, it helps to have a curt macro to get a pointer to the struct
- * lguest_device it's enclosed in.
- */
-#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev)
-
-/*D:130
- * Device configurations
- *
- * The configuration information for a device consists of one or more
- * virtqueues, a feature bitmap, and some configuration bytes.  The
- * configuration bytes don't really matter to us: the Launcher sets them up, and
- * the driver will look at them during setup.
- *
- * A convenient routine to return the device's virtqueue config array:
- * immediately after the descriptor.
- */
-static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc)
-{
-	return (void *)(desc + 1);
-}
-
-/* The features come immediately after the virtqueues. */
-static u8 *lg_features(const struct lguest_device_desc *desc)
-{
-	return (void *)(lg_vq(desc) + desc->num_vq);
-}
-
-/* The config space comes after the two feature bitmasks. */
-static u8 *lg_config(const struct lguest_device_desc *desc)
-{
-	return lg_features(desc) + desc->feature_len * 2;
-}
-
-/* The total size of the config page used by this device (incl. desc) */
-static unsigned desc_size(const struct lguest_device_desc *desc)
-{
-	return sizeof(*desc)
-		+ desc->num_vq * sizeof(struct lguest_vqconfig)
-		+ desc->feature_len * 2
-		+ desc->config_len;
-}
-
-/* This gets the device's feature bits. */
-static u64 lg_get_features(struct virtio_device *vdev)
-{
-	unsigned int i;
-	u32 features = 0;
-	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-	u8 *in_features = lg_features(desc);
-
-	/* We do this the slow but generic way. */
-	for (i = 0; i < min(desc->feature_len * 8, 32); i++)
-		if (in_features[i / 8] & (1 << (i % 8)))
-			features |= (1 << i);
-
-	return features;
-}
-
-/*
- * To notify on reset or feature finalization, we (ab)use the NOTIFY
- * hypercall, with the descriptor address of the device.
- */
-static void status_notify(struct virtio_device *vdev)
-{
-	unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
-
-	hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
-}
-
-/*
- * The virtio core takes the features the Host offers, and copies the ones
- * supported by the driver into the vdev->features array.  Once that's all
- * sorted out, this routine is called so we can tell the Host which features we
- * understand and accept.
- */
-static int lg_finalize_features(struct virtio_device *vdev)
-{
-	unsigned int i, bits;
-	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-	/* Second half of bitmap is features we accept. */
-	u8 *out_features = lg_features(desc) + desc->feature_len;
-
-	/* Give virtio_ring a chance to accept features. */
-	vring_transport_features(vdev);
-
-	/* Make sure we don't have any features > 32 bits! */
-	BUG_ON((u32)vdev->features != vdev->features);
-
-	/*
-	 * Since lguest is currently x86-only, we're little-endian.  That
-	 * means we could just memcpy.  But it's not time critical, and in
-	 * case someone copies this code, we do it the slow, obvious way.
-	 */
-	memset(out_features, 0, desc->feature_len);
-	bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
-	for (i = 0; i < bits; i++) {
-		if (__virtio_test_bit(vdev, i))
-			out_features[i / 8] |= (1 << (i % 8));
-	}
-
-	/* Tell Host we've finished with this device's feature negotiation */
-	status_notify(vdev);
-
-	return 0;
-}
-
-/* Once they've found a field, getting a copy of it is easy. */
-static void lg_get(struct virtio_device *vdev, unsigned int offset,
-		   void *buf, unsigned len)
-{
-	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-
-	/* Check they didn't ask for more than the length of the config! */
-	BUG_ON(offset + len > desc->config_len);
-	memcpy(buf, lg_config(desc) + offset, len);
-}
-
-/* Setting the contents is also trivial. */
-static void lg_set(struct virtio_device *vdev, unsigned int offset,
-		   const void *buf, unsigned len)
-{
-	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-
-	/* Check they didn't ask for more than the length of the config! */
-	BUG_ON(offset + len > desc->config_len);
-	memcpy(lg_config(desc) + offset, buf, len);
-}
-
-/*
- * The operations to get and set the status word just access the status field
- * of the device descriptor.
- */
-static u8 lg_get_status(struct virtio_device *vdev)
-{
-	return to_lgdev(vdev)->desc->status;
-}
-
-static void lg_set_status(struct virtio_device *vdev, u8 status)
-{
-	BUG_ON(!status);
-	to_lgdev(vdev)->desc->status = status;
-
-	/* Tell Host immediately if we failed. */
-	if (status & VIRTIO_CONFIG_S_FAILED)
-		status_notify(vdev);
-}
-
-static void lg_reset(struct virtio_device *vdev)
-{
-	/* 0 status means "reset" */
-	to_lgdev(vdev)->desc->status = 0;
-	status_notify(vdev);
-}
-
-/*
- * Virtqueues
- *
- * The other piece of infrastructure virtio needs is a "virtqueue": a way of
- * the Guest device registering buffers for the other side to read from or
- * write into (ie. send and receive buffers).  Each device can have multiple
- * virtqueues: for example the console driver uses one queue for sending and
- * another for receiving.
- *
- * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
- * already exists in virtio_ring.c.  We just need to connect it up.
- *
- * We start with the information we need to keep about each virtqueue.
- */
-
-/*D:140 This is the information we remember about each virtqueue. */
-struct lguest_vq_info {
-	/* A copy of the information contained in the device config. */
-	struct lguest_vqconfig config;
-
-	/* The address where we mapped the virtio ring, so we can unmap it. */
-	void *pages;
-};
-
-/*
- * When the virtio_ring code wants to prod the Host, it calls us here and we
- * make a hypercall.  We hand the physical address of the virtqueue so the Host
- * knows which virtqueue we're talking about.
- */
-static bool lg_notify(struct virtqueue *vq)
-{
-	/*
-	 * We store our virtqueue information in the "priv" pointer of the
-	 * virtqueue structure.
-	 */
-	struct lguest_vq_info *lvq = vq->priv;
-
-	hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0);
-	return true;
-}
-
-/* An extern declaration inside a C file is bad form.  Don't do it. */
-extern int lguest_setup_irq(unsigned int irq);
-
-/*
- * This routine finds the Nth virtqueue described in the configuration of
- * this device and sets it up.
- *
- * This is kind of an ugly duckling.  It'd be nicer to have a standard
- * representation of a virtqueue in the configuration space, but it seems that
- * everyone wants to do it differently.  The KVM coders want the Guest to
- * allocate its own pages and tell the Host where they are, but for lguest it's
- * simpler for the Host to simply tell us where the pages are.
- */
-static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
-				    unsigned index,
-				    void (*callback)(struct virtqueue *vq),
-				    const char *name)
-{
-	struct lguest_device *ldev = to_lgdev(vdev);
-	struct lguest_vq_info *lvq;
-	struct virtqueue *vq;
-	int err;
-
-	if (!name)
-		return NULL;
-
-	/* We must have this many virtqueues. */
-	if (index >= ldev->desc->num_vq)
-		return ERR_PTR(-ENOENT);
-
-	lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
-	if (!lvq)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * Make a copy of the "struct lguest_vqconfig" entry, which sits after
-	 * the descriptor.  We need a copy because the config space might not
-	 * be aligned correctly.
-	 */
-	memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config));
-
-	printk("Mapping virtqueue %i addr %lx\n", index,
-	       (unsigned long)lvq->config.pfn << PAGE_SHIFT);
-	/* Figure out how many pages the ring will take, and map that memory */
-	lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
-				DIV_ROUND_UP(vring_size(lvq->config.num,
-							LGUEST_VRING_ALIGN),
-					     PAGE_SIZE));
-	if (!lvq->pages) {
-		err = -ENOMEM;
-		goto free_lvq;
-	}
-
-	/*
-	 * OK, tell virtio_ring.c to set up a virtqueue now we know its size
-	 * and we've got a pointer to its pages.  Note that we set weak_barriers
-	 * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu
-	 * barriers.
-	 */
-	vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
-				 true, lvq->pages, lg_notify, callback, name);
-	if (!vq) {
-		err = -ENOMEM;
-		goto unmap;
-	}
-
-	/* Make sure the interrupt is allocated. */
-	err = lguest_setup_irq(lvq->config.irq);
-	if (err)
-		goto destroy_vring;
-
-	/*
-	 * Tell the interrupt for this virtqueue to go to the virtio_ring
-	 * interrupt handler.
-	 *
-	 * FIXME: We used to have a flag for the Host to tell us we could use
-	 * the interrupt as a source of randomness: it'd be nice to have that
-	 * back.
-	 */
-	err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
-			  dev_name(&vdev->dev), vq);
-	if (err)
-		goto free_desc;
-
-	/*
-	 * Last of all we hook up our 'struct lguest_vq_info" to the
-	 * virtqueue's priv pointer.
-	 */
-	vq->priv = lvq;
-	return vq;
-
-free_desc:
-	irq_free_desc(lvq->config.irq);
-destroy_vring:
-	vring_del_virtqueue(vq);
-unmap:
-	lguest_unmap(lvq->pages);
-free_lvq:
-	kfree(lvq);
-	return ERR_PTR(err);
-}
-/*:*/
-
-/* Cleaning up a virtqueue is easy */
-static void lg_del_vq(struct virtqueue *vq)
-{
-	struct lguest_vq_info *lvq = vq->priv;
-
-	/* Release the interrupt */
-	free_irq(lvq->config.irq, vq);
-	/* Tell virtio_ring.c to free the virtqueue. */
-	vring_del_virtqueue(vq);
-	/* Unmap the pages containing the ring. */
-	lguest_unmap(lvq->pages);
-	/* Free our own queue information. */
-	kfree(lvq);
-}
-
-static void lg_del_vqs(struct virtio_device *vdev)
-{
-	struct virtqueue *vq, *n;
-
-	list_for_each_entry_safe(vq, n, &vdev->vqs, list)
-		lg_del_vq(vq);
-}
-
-static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-		       struct virtqueue *vqs[],
-		       vq_callback_t *callbacks[],
-		       const char *names[])
-{
-	struct lguest_device *ldev = to_lgdev(vdev);
-	int i;
-
-	/* We must have this many virtqueues. */
-	if (nvqs > ldev->desc->num_vq)
-		return -ENOENT;
-
-	for (i = 0; i < nvqs; ++i) {
-		vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]);
-		if (IS_ERR(vqs[i]))
-			goto error;
-	}
-	return 0;
-
-error:
-	lg_del_vqs(vdev);
-	return PTR_ERR(vqs[i]);
-}
-
-static const char *lg_bus_name(struct virtio_device *vdev)
-{
-	return "";
-}
-
-/* The ops structure which hooks everything together. */
-static const struct virtio_config_ops lguest_config_ops = {
-	.get_features = lg_get_features,
-	.finalize_features = lg_finalize_features,
-	.get = lg_get,
-	.set = lg_set,
-	.get_status = lg_get_status,
-	.set_status = lg_set_status,
-	.reset = lg_reset,
-	.find_vqs = lg_find_vqs,
-	.del_vqs = lg_del_vqs,
-	.bus_name = lg_bus_name,
-};
-
-/*
- * The root device for the lguest virtio devices.  This makes them appear as
- * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2.
- */
-static struct device *lguest_root;
-
-/*D:120
- * This is the core of the lguest bus: actually adding a new device.
- * It's a separate function because it's neater that way, and because an
- * earlier version of the code supported hotplug and unplug.  They were removed
- * early on because they were never used.
- *
- * As Andrew Tridgell says, "Untested code is buggy code".
- *
- * It's worth reading this carefully: we start with a pointer to the new device
- * descriptor in the "lguest_devices" page, and the offset into the device
- * descriptor page so we can uniquely identify it if things go badly wrong.
- */
-static void add_lguest_device(struct lguest_device_desc *d,
-			      unsigned int offset)
-{
-	struct lguest_device *ldev;
-
-	/* Start with zeroed memory; Linux's device layer counts on it. */
-	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
-	if (!ldev) {
-		printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n",
-		       offset, d->type);
-		return;
-	}
-
-	/* This devices' parent is the lguest/ dir. */
-	ldev->vdev.dev.parent = lguest_root;
-	/*
-	 * The device type comes straight from the descriptor.  There's also a
-	 * device vendor field in the virtio_device struct, which we leave as
-	 * 0.
-	 */
-	ldev->vdev.id.device = d->type;
-	/*
-	 * We have a simple set of routines for querying the device's
-	 * configuration information and setting its status.
-	 */
-	ldev->vdev.config = &lguest_config_ops;
-	/* And we remember the device's descriptor for lguest_config_ops. */
-	ldev->desc = d;
-
-	/*
-	 * register_virtio_device() sets up the generic fields for the struct
-	 * virtio_device and calls device_register().  This makes the bus
-	 * infrastructure look for a matching driver.
-	 */
-	if (register_virtio_device(&ldev->vdev) != 0) {
-		printk(KERN_ERR "Failed to register lguest dev %u type %u\n",
-		       offset, d->type);
-		kfree(ldev);
-	}
-}
-
-/*D:110
- * scan_devices() simply iterates through the device page.  The type 0 is
- * reserved to mean "end of devices".
- */
-static void scan_devices(void)
-{
-	unsigned int i;
-	struct lguest_device_desc *d;
-
-	/* We start at the page beginning, and skip over each entry. */
-	for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
-		d = lguest_devices + i;
-
-		/* Once we hit a zero, stop. */
-		if (d->type == 0)
-			break;
-
-		printk("Device at %i has size %u\n", i, desc_size(d));
-		add_lguest_device(d, i);
-	}
-}
-
-/*D:105
- * Fairly early in boot, lguest_devices_init() is called to set up the
- * lguest device infrastructure.  We check that we are a Guest by checking
- * pv_info.name: there are other ways of checking, but this seems most
- * obvious to me.
- *
- * So we can access the "struct lguest_device_desc"s easily, we map that memory
- * and store the pointer in the global "lguest_devices".  Then we register a
- * root device from which all our devices will hang (this seems to be the
- * correct sysfs incantation).
- *
- * Finally we call scan_devices() which adds all the devices found in the
- * lguest_devices page.
- */
-static int __init lguest_devices_init(void)
-{
-	if (strcmp(pv_info.name, "lguest") != 0)
-		return 0;
-
-	lguest_root = root_device_register("lguest");
-	if (IS_ERR(lguest_root))
-		panic("Could not register lguest root");
-
-	/* Devices are in a single page above top of "normal" mem */
-	lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
-
-	scan_devices();
-	return 0;
-}
-/* We do this after core stuff, but before the drivers. */
-postcore_initcall(lguest_devices_init);
-
-/*D:150
- * At this point in the journey we used to now wade through the lguest
- * devices themselves: net, block and console.  Since they're all now virtio
- * devices rather than lguest-specific, I've decided to ignore them.  Mostly,
- * they're kind of boring.  But this does mean you'll never experience the
- * thrill of reading the forbidden love scene buried deep in the block driver.
- *
- * "make Launcher" beckons, where we answer questions like "Where do Guests
- * come from?", and "What do you do when someone asks for optimization?".
- */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 4263f4cc8c55..c4c6113eb9a6 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -2,175 +2,62 @@
  * launcher controls and communicates with the Guest.  For example,
  * the first write will tell us the Guest's memory layout and entry
  * point.  A read will run the Guest until something happens, such as
- * a signal or the Guest doing a NOTIFY out to the Launcher.  There is
- * also a way for the Launcher to attach eventfds to particular NOTIFY
- * values instead of returning from the read() call.
+ * a signal or the Guest accessing a device.
 :*/
 #include <linux/uaccess.h>
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/eventfd.h>
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/export.h>
 #include "lg.h"
 
-/*L:056
- * Before we move on, let's jump ahead and look at what the kernel does when
- * it needs to look up the eventfds.  That will complete our picture of how we
- * use RCU.
- *
- * The notification value is in cpu->pending_notify: we return true if it went
- * to an eventfd.
- */
-bool send_notify_to_eventfd(struct lg_cpu *cpu)
-{
-	unsigned int i;
-	struct lg_eventfd_map *map;
-
-	/*
-	 * This "rcu_read_lock()" helps track when someone is still looking at
-	 * the (RCU-using) eventfds array.  It's not actually a lock at all;
-	 * indeed it's a noop in many configurations.  (You didn't expect me to
-	 * explain all the RCU secrets here, did you?)
-	 */
-	rcu_read_lock();
-	/*
-	 * rcu_dereference is the counter-side of rcu_assign_pointer(); it
-	 * makes sure we don't access the memory pointed to by
-	 * cpu->lg->eventfds before cpu->lg->eventfds is set.  Sounds crazy,
-	 * but Alpha allows this!  Paul McKenney points out that a really
-	 * aggressive compiler could have the same effect:
-	 *   http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
-	 *
-	 * So play safe, use rcu_dereference to get the rcu-protected pointer:
-	 */
-	map = rcu_dereference(cpu->lg->eventfds);
-	/*
-	 * Simple array search: even if they add an eventfd while we do this,
-	 * we'll continue to use the old array and just won't see the new one.
-	 */
-	for (i = 0; i < map->num; i++) {
-		if (map->map[i].addr == cpu->pending_notify) {
-			eventfd_signal(map->map[i].event, 1);
-			cpu->pending_notify = 0;
-			break;
-		}
-	}
-	/* We're done with the rcu-protected variable cpu->lg->eventfds. */
-	rcu_read_unlock();
-
-	/* If we cleared the notification, it's because we found a match. */
-	return cpu->pending_notify == 0;
-}
-
-/*L:055
- * One of the more tricksy tricks in the Linux Kernel is a technique called
- * Read Copy Update.  Since one point of lguest is to teach lguest journeyers
- * about kernel coding, I use it here.  (In case you're curious, other purposes
- * include learning about virtualization and instilling a deep appreciation for
- * simplicity and puppies).
- *
- * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
- * add new eventfds without ever blocking readers from accessing the array.
- * The current Launcher only does this during boot, so that never happens.  But
- * Read Copy Update is cool, and adding a lock risks damaging even more puppies
- * than this code does.
- *
- * We allocate a brand new one-larger array, copy the old one and add our new
- * element.  Then we make the lg eventfd pointer point to the new array.
- * That's the easy part: now we need to free the old one, but we need to make
- * sure no slow CPU somewhere is still looking at it.  That's what
- * synchronize_rcu does for us: waits until every CPU has indicated that it has
- * moved on to know it's no longer using the old one.
- *
- * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
- */
-static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+/*L:052
+  The Launcher can get the registers, and also set some of them.
+*/
+static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
 {
-	struct lg_eventfd_map *new, *old = lg->eventfds;
-
-	/*
-	 * We don't allow notifications on value 0 anyway (pending_notify of
-	 * 0 means "nothing pending").
-	 */
-	if (!addr)
-		return -EINVAL;
-
-	/*
-	 * Replace the old array with the new one, carefully: others can
-	 * be accessing it at the same time.
-	 */
-	new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
-		      GFP_KERNEL);
-	if (!new)
-		return -ENOMEM;
+	unsigned long which;
 
-	/* First make identical copy. */
-	memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
-	new->num = old->num;
-
-	/* Now append new entry. */
-	new->map[new->num].addr = addr;
-	new->map[new->num].event = eventfd_ctx_fdget(fd);
-	if (IS_ERR(new->map[new->num].event)) {
-		int err =  PTR_ERR(new->map[new->num].event);
-		kfree(new);
-		return err;
-	}
-	new->num++;
+	/* We re-use the ptrace structure to specify which register to read. */
+	if (get_user(which, input) != 0)
+		return -EFAULT;
 
 	/*
-	 * Now put new one in place: rcu_assign_pointer() is a fancy way of
-	 * doing "lg->eventfds = new", but it uses memory barriers to make
-	 * absolutely sure that the contents of "new" written above is nailed
-	 * down before we actually do the assignment.
+	 * We set up the cpu register pointer, and their next read will
+	 * actually get the value (instead of running the guest).
 	 *
-	 * We have to think about these kinds of things when we're operating on
-	 * live data without locks.
+	 * The last argument 'true' says we can access any register.
 	 */
-	rcu_assign_pointer(lg->eventfds, new);
+	cpu->reg_read = lguest_arch_regptr(cpu, which, true);
+	if (!cpu->reg_read)
+		return -ENOENT;
 
-	/*
-	 * We're not in a big hurry.  Wait until no one's looking at old
-	 * version, then free it.
-	 */
-	synchronize_rcu();
-	kfree(old);
-
-	return 0;
+	/* And because this is a write() call, we return the length used. */
+	return sizeof(unsigned long) * 2;
 }
 
-/*L:052
- * Receiving notifications from the Guest is usually done by attaching a
- * particular LHCALL_NOTIFY value to an event filedescriptor.  The eventfd will
- * become readable when the Guest does an LHCALL_NOTIFY with that value.
- *
- * This is really convenient for processing each virtqueue in a separate
- * thread.
- */
-static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
 {
-	unsigned long addr, fd;
-	int err;
+	unsigned long which, value, *reg;
 
-	if (get_user(addr, input) != 0)
+	/* We re-use the ptrace structure to specify which register to read. */
+	if (get_user(which, input) != 0)
 		return -EFAULT;
 	input++;
-	if (get_user(fd, input) != 0)
+	if (get_user(value, input) != 0)
 		return -EFAULT;
 
-	/*
-	 * Just make sure two callers don't add eventfds at once.  We really
-	 * only need to lock against callers adding to the same Guest, so using
-	 * the Big Lguest Lock is overkill.  But this is setup, not a fast path.
-	 */
-	mutex_lock(&lguest_lock);
-	err = add_eventfd(lg, addr, fd);
-	mutex_unlock(&lguest_lock);
+	/* The last argument 'false' means we can't access all registers. */
+	reg = lguest_arch_regptr(cpu, which, false);
+	if (!reg)
+		return -ENOENT;
 
-	return err;
+	*reg = value;
+
+	/* And because this is a write() call, we return the length used. */
+	return sizeof(unsigned long) * 3;
 }
 
 /*L:050
@@ -194,6 +81,23 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 	return 0;
 }
 
+/*L:053
+ * Deliver a trap: this is used by the Launcher if it can't emulate
+ * an instruction.
+ */
+static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
+{
+	unsigned long trapnum;
+
+	if (get_user(trapnum, input) != 0)
+		return -EFAULT;
+
+	if (!deliver_trap(cpu, trapnum))
+		return -EINVAL;
+
+	return 0;
+}
+
 /*L:040
  * Once our Guest is initialized, the Launcher makes it run by reading
  * from /dev/lguest.
@@ -237,8 +141,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 	 * If we returned from read() last time because the Guest sent I/O,
 	 * clear the flag.
 	 */
-	if (cpu->pending_notify)
-		cpu->pending_notify = 0;
+	if (cpu->pending.trap)
+		cpu->pending.trap = 0;
 
 	/* Run the Guest until something interesting happens. */
 	return run_guest(cpu, (unsigned long __user *)user);
@@ -319,7 +223,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	/* "struct lguest" contains all we (the Host) know about a Guest. */
 	struct lguest *lg;
 	int err;
-	unsigned long args[3];
+	unsigned long args[4];
 
 	/*
 	 * We grab the Big Lguest lock, which protects against multiple
@@ -343,21 +247,15 @@ static int initialize(struct file *file, const unsigned long __user *input)
 		goto unlock;
 	}
 
-	lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
-	if (!lg->eventfds) {
-		err = -ENOMEM;
-		goto free_lg;
-	}
-	lg->eventfds->num = 0;
-
 	/* Populate the easy fields of our "struct lguest" */
 	lg->mem_base = (void __user *)args[0];
 	lg->pfn_limit = args[1];
+	lg->device_limit = args[3];
 
 	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
 	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
 	if (err)
-		goto free_eventfds;
+		goto free_lg;
 
 	/*
 	 * Initialize the Guest's shadow page tables.  This allocates
@@ -378,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
 free_regs:
 	/* FIXME: This should be in free_vcpu */
 	free_page(lg->cpus[0].regs_page);
-free_eventfds:
-	kfree(lg->eventfds);
 free_lg:
 	kfree(lg);
 unlock:
@@ -432,8 +328,12 @@ static ssize_t write(struct file *file, const char __user *in,
 		return initialize(file, input);
 	case LHREQ_IRQ:
 		return user_send_irq(cpu, input);
-	case LHREQ_EVENTFD:
-		return attach_eventfd(lg, input);
+	case LHREQ_GETREG:
+		return getreg_setup(cpu, input);
+	case LHREQ_SETREG:
+		return setreg(cpu, input);
+	case LHREQ_TRAP:
+		return trap(cpu, input);
 	default:
 		return -EINVAL;
 	}
@@ -478,11 +378,6 @@ static int close(struct inode *inode, struct file *file)
 		mmput(lg->cpus[i].mm);
 	}
 
-	/* Release any eventfds they registered. */
-	for (i = 0; i < lg->eventfds->num; i++)
-		eventfd_ctx_put(lg->eventfds->map[i].event);
-	kfree(lg->eventfds);
-
 	/*
 	 * If lg->dead doesn't contain an error code it will be NULL or a
 	 * kmalloc()ed string, either of which is ok to hand to kfree().
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index e8b55c3a6170..e3abebc912c0 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -250,6 +250,16 @@ static void release_pte(pte_t pte)
 }
 /*:*/
 
+static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
+{
+	/* We don't handle large pages. */
+	if (pte_flags(gpte) & _PAGE_PSE)
+		return false;
+
+	return (pte_pfn(gpte) >= cpu->lg->pfn_limit
+		&& pte_pfn(gpte) < cpu->lg->device_limit);
+}
+
 static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
 {
 	if ((pte_flags(gpte) & _PAGE_PSE) ||
@@ -374,8 +384,14 @@ static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
  *
  * If we fixed up the fault (ie. we mapped the address), this routine returns
  * true.  Otherwise, it was a real fault and we need to tell the Guest.
+ *
+ * There's a corner case: they're trying to access memory between
+ * pfn_limit and device_limit, which is I/O memory.  In this case, we
+ * return false and set @iomem to the physical address, so that the
+ * Launcher can handle the instruction manually.
  */
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
+		 unsigned long *iomem)
 {
 	unsigned long gpte_ptr;
 	pte_t gpte;
@@ -383,6 +399,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	pmd_t gpmd;
 	pgd_t gpgd;
 
+	*iomem = 0;
+
 	/* We never demand page the Switcher, so trying is a mistake. */
 	if (vaddr >= switcher_addr)
 		return false;
@@ -459,6 +477,12 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
 		return false;
 
+	/* If they're accessing io memory, we expect a fault. */
+	if (gpte_in_iomem(cpu, gpte)) {
+		*iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+		return false;
+	}
+
 	/*
 	 * Check that the Guest PTE flags are OK, and the page number is below
 	 * the pfn_limit (ie. not mapping the Launcher binary).
@@ -553,7 +577,9 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
  */
 void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 {
-	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
+	unsigned long iomem;
+
+	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
 		kill_guest(cpu, "bad stack page %#lx", vaddr);
 }
 /*:*/
@@ -647,7 +673,7 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu)
 /*:*/
 
 /* We walk down the guest page tables to get a guest-physical address */
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
+bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
 {
 	pgd_t gpgd;
 	pte_t gpte;
@@ -656,31 +682,47 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 #endif
 
 	/* Still not set up?  Just map 1:1. */
-	if (unlikely(cpu->linear_pages))
-		return vaddr;
+	if (unlikely(cpu->linear_pages)) {
+		*paddr = vaddr;
+		return true;
+	}
 
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
-		kill_guest(cpu, "Bad address %#lx", vaddr);
-		return -1UL;
-	}
+	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+		goto fail;
 
 #ifdef CONFIG_X86_PAE
 	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) {
-		kill_guest(cpu, "Bad address %#lx", vaddr);
-		return -1UL;
-	}
+	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+		goto fail;
 	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
 #else
 	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
 #endif
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-		kill_guest(cpu, "Bad address %#lx", vaddr);
+		goto fail;
+
+	*paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+	return true;
+
+fail:
+	*paddr = -1UL;
+	return false;
+}
 
-	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+/*
+ * This is the version we normally use: kills the Guest if it uses a
+ * bad address
+ */
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
+{
+	unsigned long paddr;
+
+	if (!__guest_pa(cpu, vaddr, &paddr))
+		kill_guest(cpu, "Bad address %#lx", vaddr);
+	return paddr;
 }
 
 /*
@@ -912,7 +954,8 @@ static void __guest_set_pte(struct lg_cpu *cpu, int idx,
 			 * now.  This shaves 10% off a copy-on-write
 			 * micro-benchmark.
 			 */
-			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+			if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
+			    && !gpte_in_iomem(cpu, gpte)) {
 				if (!check_gpte(cpu, gpte))
 					return;
 				set_pte(spte,
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 6adfd7ba4c97..30f2aef69d78 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -182,6 +182,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
 }
 /*:*/
 
+unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
+{
+	switch (reg_off) {
+	case offsetof(struct pt_regs, bx):
+		return &cpu->regs->ebx;
+	case offsetof(struct pt_regs, cx):
+		return &cpu->regs->ecx;
+	case offsetof(struct pt_regs, dx):
+		return &cpu->regs->edx;
+	case offsetof(struct pt_regs, si):
+		return &cpu->regs->esi;
+	case offsetof(struct pt_regs, di):
+		return &cpu->regs->edi;
+	case offsetof(struct pt_regs, bp):
+		return &cpu->regs->ebp;
+	case offsetof(struct pt_regs, ax):
+		return &cpu->regs->eax;
+	case offsetof(struct pt_regs, ip):
+		return &cpu->regs->eip;
+	case offsetof(struct pt_regs, sp):
+		return &cpu->regs->esp;
+	}
+
+	/* Launcher can read these, but we don't allow any setting. */
+	if (any) {
+		switch (reg_off) {
+		case offsetof(struct pt_regs, ds):
+			return &cpu->regs->ds;
+		case offsetof(struct pt_regs, es):
+			return &cpu->regs->es;
+		case offsetof(struct pt_regs, fs):
+			return &cpu->regs->fs;
+		case offsetof(struct pt_regs, gs):
+			return &cpu->regs->gs;
+		case offsetof(struct pt_regs, cs):
+			return &cpu->regs->cs;
+		case offsetof(struct pt_regs, flags):
+			return &cpu->regs->eflags;
+		case offsetof(struct pt_regs, ss):
+			return &cpu->regs->ss;
+		}
+	}
+
+	return NULL;
+}
+
 /*M:002
  * There are hooks in the scheduler which we can register to tell when we
  * get kicked off the CPU (preempt_notifier_register()).  This would allow us
@@ -269,110 +315,73 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
  * usually attached to a PC.
  *
  * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here.  We see if it's one of those troublesome
- * instructions and skip over it.  We return true if we did.
+ * Protection Fault) and come here.  We queue this to be sent out to the
+ * Launcher to handle.
  */
-static int emulate_insn(struct lg_cpu *cpu)
-{
-	u8 insn;
-	unsigned int insnlen = 0, in = 0, small_operand = 0;
-	/*
-	 * The eip contains the *virtual* address of the Guest's instruction:
-	 * walk the Guest's page tables to find the "physical" address.
-	 */
-	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
-
-	/*
-	 * This must be the Guest kernel trying to do something, not userspace!
-	 * The bottom two bits of the CS segment register are the privilege
-	 * level.
-	 */
-	if ((cpu->regs->cs & 3) != GUEST_PL)
-		return 0;
-
-	/* Decoding x86 instructions is icky. */
-	insn = lgread(cpu, physaddr, u8);
 
-	/*
-	 * Around 2.6.33, the kernel started using an emulation for the
-	 * cmpxchg8b instruction in early boot on many configurations.  This
-	 * code isn't paravirtualized, and it tries to disable interrupts.
-	 * Ignore it, which will Mostly Work.
-	 */
-	if (insn == 0xfa) {
-		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
-		cpu->regs->eip++;
-		return 1;
+/*
+ * The eip contains the *virtual* address of the Guest's instruction:
+ * we copy the instruction here so the Launcher doesn't have to walk
+ * the page tables to decode it.  We handle the case (eg. in a kernel
+ * module) where the instruction is over two pages, and the pages are
+ * virtually but not physically contiguous.
+ *
+ * The longest possible x86 instruction is 15 bytes, but we don't handle
+ * anything that strange.
+ */
+static void copy_from_guest(struct lg_cpu *cpu,
+			    void *dst, unsigned long vaddr, size_t len)
+{
+	size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
+	unsigned long paddr;
+
+	BUG_ON(len > PAGE_SIZE);
+
+	/* If it goes over a page, copy in two parts. */
+	if (len > to_page_end) {
+		/* But make sure the next page is mapped! */
+		if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
+			copy_from_guest(cpu, dst + to_page_end,
+					vaddr + to_page_end,
+					len - to_page_end);
+		else
+			/* Otherwise fill with zeroes. */
+			memset(dst + to_page_end, 0, len - to_page_end);
+		len = to_page_end;
 	}
 
-	/*
-	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
-	 */
-	if (insn == 0x66) {
-		small_operand = 1;
-		/* The instruction is 1 byte so far, read the next byte. */
-		insnlen = 1;
-		insn = lgread(cpu, physaddr + insnlen, u8);
-	}
+	/* This will kill the guest if it isn't mapped, but that
+	 * shouldn't happen. */
+	__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
+}
 
-	/*
-	 * We can ignore the lower bit for the moment and decode the 4 opcodes
-	 * we need to emulate.
-	 */
-	switch (insn & 0xFE) {
-	case 0xE4: /* in     <next byte>,%al */
-		insnlen += 2;
-		in = 1;
-		break;
-	case 0xEC: /* in     (%dx),%al */
-		insnlen += 1;
-		in = 1;
-		break;
-	case 0xE6: /* out    %al,<next byte> */
-		insnlen += 2;
-		break;
-	case 0xEE: /* out    %al,(%dx) */
-		insnlen += 1;
-		break;
-	default:
-		/* OK, we don't know what this is, can't emulate. */
-		return 0;
-	}
 
-	/*
-	 * If it was an "IN" instruction, they expect the result to be read
-	 * into %eax, so we change %eax.  We always return all-ones, which
-	 * traditionally means "there's nothing there".
-	 */
-	if (in) {
-		/* Lower bit tells means it's a 32/16 bit access */
-		if (insn & 0x1) {
-			if (small_operand)
-				cpu->regs->eax |= 0xFFFF;
-			else
-				cpu->regs->eax = 0xFFFFFFFF;
-		} else
-			cpu->regs->eax |= 0xFF;
-	}
-	/* Finally, we've "done" the instruction, so move past it. */
-	cpu->regs->eip += insnlen;
-	/* Success! */
-	return 1;
+static void setup_emulate_insn(struct lg_cpu *cpu)
+{
+	cpu->pending.trap = 13;
+	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
+			sizeof(cpu->pending.insn));
+}
+
+static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
+{
+	cpu->pending.trap = 14;
+	cpu->pending.addr = iomem_addr;
+	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
+			sizeof(cpu->pending.insn));
 }
 
 /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
 void lguest_arch_handle_trap(struct lg_cpu *cpu)
 {
+	unsigned long iomem_addr;
+
 	switch (cpu->regs->trapnum) {
 	case 13: /* We've intercepted a General Protection Fault. */
-		/*
-		 * Check if this was one of those annoying IN or OUT
-		 * instructions which we need to emulate.  If so, we just go
-		 * back into the Guest after we've done it.
-		 */
+		/* Hand to Launcher to emulate those pesky IN and OUT insns */
 		if (cpu->regs->errcode == 0) {
-			if (emulate_insn(cpu))
-				return;
+			setup_emulate_insn(cpu);
+			return;
 		}
 		break;
 	case 14: /* We've intercepted a Page Fault. */
@@ -387,9 +396,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 		 * whether kernel or userspace code.
 		 */
 		if (demand_page(cpu, cpu->arch.last_pagefault,
-				cpu->regs->errcode))
+				cpu->regs->errcode, &iomem_addr))
 			return;
 
+		/* Was this an access to memory mapped IO? */
+		if (iomem_addr) {
+			/* Tell Launcher, let it handle it. */
+			setup_iomem_insn(cpu, iomem_addr);
+			return;
+		}
+
 		/*
 		 * OK, it's really not there (or not OK): the Guest needs to
 		 * know.  We write out the cr2 value so it knows where the
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-18 09:24:01 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-18 09:24:01 -0800
commit	53861af9a17022898619a2ae4ead0dfc601b7c13 (patch)
tree	dc11088d9e86fa1d8d8479974864153a8f976897 /drivers/lguest
parent	5c2770079fb9b8c5bfb7113d9e76de66e77a0e24 (diff)
parent	5b40a7daf51812b35cf05d1601a779a7043f8414 (diff)
download	linux-53861af9a17022898619a2ae4ead0dfc601b7c13.tar.gz linux-53861af9a17022898619a2ae4ead0dfc601b7c13.tar.bz2 linux-53861af9a17022898619a2ae4ead0dfc601b7c13.zip