summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-17 11:10:11 -0700
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-17 11:10:11 -0700
commitfb9fc395174138983a49f2da982ed14caabbe741 (patch)
tree5d5d3643ee6853a899205613da272cc343fdc1a4
parent0eafaae84e21ac033815cc9f33c3ae889cd7ccfe (diff)
parentace2e92e193126711cb3a83a3752b2c5b8396950 (diff)
downloadlinux-stable-fb9fc395174138983a49f2da982ed14caabbe741.tar.gz
linux-stable-fb9fc395174138983a49f2da982ed14caabbe741.tar.bz2
linux-stable-fb9fc395174138983a49f2da982ed14caabbe741.zip
Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen
* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: xfs: eagerly remove vmap mappings to avoid upsetting Xen xen: add some debug output for failed multicalls xen: fix incorrect vcpu_register_vcpu_info hypercall argument xen: ask the hypervisor how much space it needs reserved xen: lock pte pages while pinning/unpinning xen: deal with stale cr3 values when unpinning pagetables xen: add batch completion callbacks xen: yield to IPI target if necessary Clean up duplicate includes in arch/i386/xen/ remove dead code in pgtable_cache_init paravirt: clean up lazy mode handling paravirt: refactor struct paravirt_ops into smaller pv_*_ops
-rw-r--r--arch/x86/kernel/alternative.c4
-rw-r--r--arch/x86/kernel/asm-offsets_32.c14
-rw-r--r--arch/x86/kernel/entry_32.S2
-rw-r--r--arch/x86/kernel/paravirt_32.c224
-rw-r--r--arch/x86/kernel/vmi_32.c201
-rw-r--r--arch/x86/mm/init_32.c22
-rw-r--r--arch/x86/xen/enlighten.c233
-rw-r--r--arch/x86/xen/mmu.c145
-rw-r--r--arch/x86/xen/multicalls.c52
-rw-r--r--arch/x86/xen/multicalls.h5
-rw-r--r--arch/x86/xen/smp.c14
-rw-r--r--arch/x86/xen/time.c6
-rw-r--r--arch/x86/xen/xen-ops.h10
-rw-r--r--drivers/char/hvc_lguest.c2
-rw-r--r--drivers/lguest/core.c6
-rw-r--r--drivers/lguest/lguest.c152
-rw-r--r--drivers/lguest/lguest_bus.c2
-rw-r--r--include/asm-x86/paravirt.h487
-rw-r--r--include/asm-x86/pgtable-3level-defs.h2
-rw-r--r--include/xen/interface/vcpu.h5
-rw-r--r--mm/Kconfig1
21 files changed, 960 insertions, 629 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 11b03d3c6fda..42421437ded3 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -369,8 +369,8 @@ void apply_paravirt(struct paravirt_patch_site *start,
BUG_ON(p->len > MAX_PATCH_LEN);
/* prep the buffer with the original instructions */
memcpy(insnbuf, p->instr, p->len);
- used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
- (unsigned long)p->instr, p->len);
+ used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
+ (unsigned long)p->instr, p->len);
BUG_ON(used > p->len);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 8029742c0fc1..f1b7cdda82b3 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -116,12 +116,14 @@ void foo(void)
#ifdef CONFIG_PARAVIRT
BLANK();
- OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
- OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
- OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
- OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
- OFFSET(PARAVIRT_iret, paravirt_ops, iret);
- OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+ OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
+ OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+ OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+ OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+ OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+ OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+ OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+ OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
#endif
#ifdef CONFIG_XEN
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 8099fea0a72f..dc7f938e5015 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -437,7 +437,7 @@ ldt_ss:
* is still available to implement the setting of the high
* 16-bits in the INTERRUPT_RETURN paravirt-op.
*/
- cmpl $0, paravirt_ops+PARAVIRT_enabled
+ cmpl $0, pv_info+PARAVIRT_enabled
jne restore_nocheck
#endif
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
index 739cfb207dd7..6a80d67c2121 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt_32.c
@@ -42,32 +42,33 @@ void _paravirt_nop(void)
static void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
- paravirt_ops.name);
+ pv_info.name);
}
char *memory_setup(void)
{
- return paravirt_ops.memory_setup();
+ return pv_init_ops.memory_setup();
}
/* Simple instruction patching code. */
-#define DEF_NATIVE(name, code) \
- extern const char start_##name[], end_##name[]; \
- asm("start_" #name ": " code "; end_" #name ":")
-
-DEF_NATIVE(irq_disable, "cli");
-DEF_NATIVE(irq_enable, "sti");
-DEF_NATIVE(restore_fl, "push %eax; popf");
-DEF_NATIVE(save_fl, "pushf; pop %eax");
-DEF_NATIVE(iret, "iret");
-DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
-DEF_NATIVE(read_cr2, "mov %cr2, %eax");
-DEF_NATIVE(write_cr3, "mov %eax, %cr3");
-DEF_NATIVE(read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(clts, "clts");
-DEF_NATIVE(read_tsc, "rdtsc");
-
-DEF_NATIVE(ud2a, "ud2a");
+#define DEF_NATIVE(ops, name, code) \
+ extern const char start_##ops##_##name[], end_##ops##_##name[]; \
+ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
+
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
+DEF_NATIVE(pv_cpu_ops, iret, "iret");
+DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
+DEF_NATIVE(pv_cpu_ops, clts, "clts");
+DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+
+/* Undefined instruction for dealing with missing ops pointers. */
+static const unsigned char ud2a[] = { 0x0f, 0x0b };
static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned ret;
switch(type) {
-#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
- SITE(irq_disable);
- SITE(irq_enable);
- SITE(restore_fl);
- SITE(save_fl);
- SITE(iret);
- SITE(irq_enable_sysexit);
- SITE(read_cr2);
- SITE(read_cr3);
- SITE(write_cr3);
- SITE(clts);
- SITE(read_tsc);
+#define SITE(ops, x) \
+ case PARAVIRT_PATCH(ops.x): \
+ start = start_##ops##_##x; \
+ end = end_##ops##_##x; \
+ goto patch_site
+
+ SITE(pv_irq_ops, irq_disable);
+ SITE(pv_irq_ops, irq_enable);
+ SITE(pv_irq_ops, restore_fl);
+ SITE(pv_irq_ops, save_fl);
+ SITE(pv_cpu_ops, iret);
+ SITE(pv_cpu_ops, irq_enable_sysexit);
+ SITE(pv_mmu_ops, read_cr2);
+ SITE(pv_mmu_ops, read_cr3);
+ SITE(pv_mmu_ops, write_cr3);
+ SITE(pv_cpu_ops, clts);
+ SITE(pv_cpu_ops, read_tsc);
#undef SITE
patch_site:
ret = paravirt_patch_insns(ibuf, len, start, end);
break;
- case PARAVIRT_PATCH(make_pgd):
- case PARAVIRT_PATCH(make_pte):
- case PARAVIRT_PATCH(pgd_val):
- case PARAVIRT_PATCH(pte_val):
-#ifdef CONFIG_X86_PAE
- case PARAVIRT_PATCH(make_pmd):
- case PARAVIRT_PATCH(pmd_val):
-#endif
- /* These functions end up returning exactly what
- they're passed, in the same registers. */
- ret = paravirt_patch_nop();
- break;
-
default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
@@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf,
return 5;
}
-unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
unsigned long addr, unsigned len)
{
struct branch *b = insnbuf;
@@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
return 5;
}
+/* Neat trick to map patch type back to the call within the
+ * corresponding structure. */
+static void *get_call_destination(u8 type)
+{
+ struct paravirt_patch_template tmpl = {
+ .pv_init_ops = pv_init_ops,
+ .pv_time_ops = pv_time_ops,
+ .pv_cpu_ops = pv_cpu_ops,
+ .pv_irq_ops = pv_irq_ops,
+ .pv_apic_ops = pv_apic_ops,
+ .pv_mmu_ops = pv_mmu_ops,
+ };
+ return *((void **)&tmpl + type);
+}
+
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
unsigned long addr, unsigned len)
{
- void *opfunc = *((void **)&paravirt_ops + type);
+ void *opfunc = get_call_destination(type);
unsigned ret;
if (opfunc == NULL)
/* If there's no function, patch it with a ud2a (BUG) */
- ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
+ ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
else if (opfunc == paravirt_nop)
/* If the operation is a nop, then nop the callsite */
ret = paravirt_patch_nop();
- else if (type == PARAVIRT_PATCH(iret) ||
- type == PARAVIRT_PATCH(irq_enable_sysexit))
+ else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+ type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
/* If operation requires a jmp, then jmp */
- ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
+ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
else
/* Otherwise call the function; assume target could
clobber any caller-save reg */
@@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
void init_IRQ(void)
{
- paravirt_ops.init_IRQ();
+ pv_irq_ops.init_IRQ();
}
static void native_flush_tlb(void)
@@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void);
static int __init print_banner(void)
{
- paravirt_ops.banner();
+ pv_init_ops.banner();
return 0;
}
core_initcall(print_banner);
@@ -273,47 +281,96 @@ int paravirt_disable_iospace(void)
return ret;
}
-struct paravirt_ops paravirt_ops = {
+static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+
+static inline void enter_lazy(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+ BUG_ON(preemptible());
+
+ x86_write_percpu(paravirt_lazy_mode, mode);
+}
+
+void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode);
+ BUG_ON(preemptible());
+
+ x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+}
+
+void paravirt_enter_lazy_mmu(void)
+{
+ enter_lazy(PARAVIRT_LAZY_MMU);
+}
+
+void paravirt_leave_lazy_mmu(void)
+{
+ paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+}
+
+void paravirt_enter_lazy_cpu(void)
+{
+ enter_lazy(PARAVIRT_LAZY_CPU);
+}
+
+void paravirt_leave_lazy_cpu(void)
+{
+ paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+}
+
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
+{
+ return x86_read_percpu(paravirt_lazy_mode);
+}
+
+struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
.kernel_rpl = 0,
.shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
+};
- .patch = native_patch,
+struct pv_init_ops pv_init_ops = {
+ .patch = native_patch,
.banner = default_banner,
.arch_setup = paravirt_nop,
.memory_setup = machine_specific_memory_setup,
+};
+
+struct pv_time_ops pv_time_ops = {
+ .time_init = hpet_time_init,
.get_wallclock = native_get_wallclock,
.set_wallclock = native_set_wallclock,
- .time_init = hpet_time_init,
+ .sched_clock = native_sched_clock,
+ .get_cpu_khz = native_calculate_cpu_khz,
+};
+
+struct pv_irq_ops pv_irq_ops = {
.init_IRQ = native_init_IRQ,
+ .save_fl = native_save_fl,
+ .restore_fl = native_restore_fl,
+ .irq_disable = native_irq_disable,
+ .irq_enable = native_irq_enable,
+ .safe_halt = native_safe_halt,
+ .halt = native_halt,
+};
+struct pv_cpu_ops pv_cpu_ops = {
.cpuid = native_cpuid,
.get_debugreg = native_get_debugreg,
.set_debugreg = native_set_debugreg,
.clts = native_clts,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
- .read_cr2 = native_read_cr2,
- .write_cr2 = native_write_cr2,
- .read_cr3 = native_read_cr3,
- .write_cr3 = native_write_cr3,
.read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = native_write_cr4,
- .save_fl = native_save_fl,
- .restore_fl = native_restore_fl,
- .irq_disable = native_irq_disable,
- .irq_enable = native_irq_enable,
- .safe_halt = native_safe_halt,
- .halt = native_halt,
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
- .sched_clock = native_sched_clock,
- .get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,
@@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = {
.write_idt_entry = write_dt_entry,
.load_esp0 = native_load_esp0,
+ .irq_enable_sysexit = native_irq_enable_sysexit,
+ .iret = native_iret,
+
.set_iopl_mask = native_set_iopl_mask,
.io_delay = native_io_delay,
+ .lazy_mode = {
+ .enter = paravirt_nop,
+ .leave = paravirt_nop,
+ },
+};
+
+struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
.apic_write = native_apic_write,
.apic_write_atomic = native_apic_write_atomic,
@@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = {
.setup_secondary_clock = setup_secondary_APIC_clock,
.startup_ipi_hook = paravirt_nop,
#endif
- .set_lazy_mode = paravirt_nop,
+};
+struct pv_mmu_ops pv_mmu_ops = {
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
+ .read_cr2 = native_read_cr2,
+ .write_cr2 = native_write_cr2,
+ .read_cr3 = native_read_cr3,
+ .write_cr3 = native_write_cr3,
+
.flush_tlb_user = native_flush_tlb,
.flush_tlb_kernel = native_flush_tlb_global,
.flush_tlb_single = native_flush_tlb_single,
@@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = {
.make_pte = native_make_pte,
.make_pgd = native_make_pgd,
- .irq_enable_sysexit = native_irq_enable_sysexit,
- .iret = native_iret,
-
.dup_mmap = paravirt_nop,
.exit_mmap = paravirt_nop,
.activate_mm = paravirt_nop,
+
+ .lazy_mode = {
+ .enter = paravirt_nop,
+ .leave = paravirt_nop,
+ },
};
-EXPORT_SYMBOL(paravirt_ops);
+EXPORT_SYMBOL_GPL(pv_time_ops);
+EXPORT_SYMBOL_GPL(pv_cpu_ops);
+EXPORT_SYMBOL_GPL(pv_mmu_ops);
+EXPORT_SYMBOL_GPL(pv_apic_ops);
+EXPORT_SYMBOL_GPL(pv_info);
+EXPORT_SYMBOL (pv_irq_ops);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 18673e0f193b..f02bad68abaa 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
unsigned long eip, unsigned len)
{
switch (type) {
- case PARAVIRT_PATCH(irq_disable):
+ case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
return patch_internal(VMI_CALL_DisableInterrupts, len,
insns, eip);
- case PARAVIRT_PATCH(irq_enable):
+ case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
return patch_internal(VMI_CALL_EnableInterrupts, len,
insns, eip);
- case PARAVIRT_PATCH(restore_fl):
+ case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
return patch_internal(VMI_CALL_SetInterruptMask, len,
insns, eip);
- case PARAVIRT_PATCH(save_fl):
+ case PARAVIRT_PATCH(pv_irq_ops.save_fl):
return patch_internal(VMI_CALL_GetInterruptMask, len,
insns, eip);
- case PARAVIRT_PATCH(iret):
+ case PARAVIRT_PATCH(pv_cpu_ops.iret):
return patch_internal(VMI_CALL_IRET, len, insns, eip);
- case PARAVIRT_PATCH(irq_enable_sysexit):
+ case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
default:
break;
@@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
}
#endif
-static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
+static void vmi_enter_lazy_cpu(void)
{
- static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
-
- if (!vmi_ops.set_lazy_mode)
- return;
+ paravirt_enter_lazy_cpu();
+ vmi_ops.set_lazy_mode(2);
+}
- /* Modes should never nest or overlap */
- BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE ||
- mode == PARAVIRT_LAZY_FLUSH));
+static void vmi_enter_lazy_mmu(void)
+{
+ paravirt_enter_lazy_mmu();
+ vmi_ops.set_lazy_mode(1);
+}
- if (mode == PARAVIRT_LAZY_FLUSH) {
- vmi_ops.set_lazy_mode(0);
- vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
- } else {
- vmi_ops.set_lazy_mode(mode);
- __get_cpu_var(lazy_mode) = mode;
- }
+static void vmi_leave_lazy(void)
+{
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
+ vmi_ops.set_lazy_mode(0);
}
static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -690,9 +688,9 @@ do { \
reloc = call_vrom_long_func(vmi_rom, get_reloc, \
VMI_CALL_##vmicall); \
if (rel->type == VMI_RELOCATION_CALL_REL) \
- paravirt_ops.opname = (void *)rel->eip; \
+ opname = (void *)rel->eip; \
else if (rel->type == VMI_RELOCATION_NOP) \
- paravirt_ops.opname = (void *)vmi_nop; \
+ opname = (void *)vmi_nop; \
else if (rel->type != VMI_RELOCATION_NONE) \
printk(KERN_WARNING "VMI: Unknown relocation " \
"type %d for " #vmicall"\n",\
@@ -712,7 +710,7 @@ do { \
VMI_CALL_##vmicall); \
BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
if (rel->type == VMI_RELOCATION_CALL_REL) { \
- paravirt_ops.opname = wrapper; \
+ opname = wrapper; \
vmi_ops.cache = (void *)rel->eip; \
} \
} while (0)
@@ -732,11 +730,11 @@ static inline int __init activate_vmi(void)
}
savesegment(cs, kernel_cs);
- paravirt_ops.paravirt_enabled = 1;
- paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+ pv_info.paravirt_enabled = 1;
+ pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+ pv_info.name = "vmi";
- paravirt_ops.patch = vmi_patch;
- paravirt_ops.name = "vmi";
+ pv_init_ops.patch = vmi_patch;
/*
* Many of these operations are ABI compatible with VMI.
@@ -754,26 +752,26 @@ static inline int __init activate_vmi(void)
*/
/* CPUID is special, so very special it gets wrapped like a present */
- para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);
-
- para_fill(clts, CLTS);
- para_fill(get_debugreg, GetDR);
- para_fill(set_debugreg, SetDR);
- para_fill(read_cr0, GetCR0);
- para_fill(read_cr2, GetCR2);
- para_fill(read_cr3, GetCR3);
- para_fill(read_cr4, GetCR4);
- para_fill(write_cr0, SetCR0);
- para_fill(write_cr2, SetCR2);
- para_fill(write_cr3, SetCR3);
- para_fill(write_cr4, SetCR4);
- para_fill(save_fl, GetInterruptMask);
- para_fill(restore_fl, SetInterruptMask);
- para_fill(irq_disable, DisableInterrupts);
- para_fill(irq_enable, EnableInterrupts);
-
- para_fill(wbinvd, WBINVD);
- para_fill(read_tsc, RDTSC);
+ para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
+
+ para_fill(pv_cpu_ops.clts, CLTS);
+ para_fill(pv_cpu_ops.get_debugreg, GetDR);
+ para_fill(pv_cpu_ops.set_debugreg, SetDR);
+ para_fill(pv_cpu_ops.read_cr0, GetCR0);
+ para_fill(pv_mmu_ops.read_cr2, GetCR2);
+ para_fill(pv_mmu_ops.read_cr3, GetCR3);
+ para_fill(pv_cpu_ops.read_cr4, GetCR4);
+ para_fill(pv_cpu_ops.write_cr0, SetCR0);
+ para_fill(pv_mmu_ops.write_cr2, SetCR2);
+ para_fill(pv_mmu_ops.write_cr3, SetCR3);
+ para_fill(pv_cpu_ops.write_cr4, SetCR4);
+ para_fill(pv_irq_ops.save_fl, GetInterruptMask);
+ para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
+ para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
+ para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
+
+ para_fill(pv_cpu_ops.wbinvd, WBINVD);
+ para_fill(pv_cpu_ops.read_tsc, RDTSC);
/* The following we emulate with trap and emulate for now */
/* paravirt_ops.read_msr = vmi_rdmsr */
@@ -781,29 +779,38 @@ static inline int __init activate_vmi(void)
/* paravirt_ops.rdpmc = vmi_rdpmc */
/* TR interface doesn't pass TR value, wrap */
- para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);
+ para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
/* LDT is special, too */
- para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
- para_fill(load_gdt, SetGDT);
- para_fill(load_idt, SetIDT);
- para_fill(store_gdt, GetGDT);
- para_fill(store_idt, GetIDT);
- para_fill(store_tr, GetTR);
- paravirt_ops.load_tls = vmi_load_tls;
- para_fill(write_ldt_entry, WriteLDTEntry);
- para_fill(write_gdt_entry, WriteGDTEntry);
- para_fill(write_idt_entry, WriteIDTEntry);
- para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
- para_fill(set_iopl_mask, SetIOPLMask);
- para_fill(io_delay, IODelay);
- para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
+ para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
+
+ para_fill(pv_cpu_ops.load_gdt, SetGDT);
+ para_fill(pv_cpu_ops.load_idt, SetIDT);
+ para_fill(pv_cpu_ops.store_gdt, GetGDT);
+ para_fill(pv_cpu_ops.store_idt, GetIDT);
+ para_fill(pv_cpu_ops.store_tr, GetTR);
+ pv_cpu_ops.load_tls = vmi_load_tls;
+ para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry);
+ para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry);
+ para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry);
+ para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
+ para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
+ para_fill(pv_cpu_ops.io_delay, IODelay);
+
+ para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+ set_lazy_mode, SetLazyMode);
+ para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+ set_lazy_mode, SetLazyMode);
+
+ para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
+ set_lazy_mode, SetLazyMode);
+ para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+ set_lazy_mode, SetLazyMode);
/* user and kernel flush are just handled with different flags to FlushTLB */
- para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
- para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
- para_fill(flush_tlb_single, InvalPage);
+ para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
+ para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
+ para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
/*
* Until a standard flag format can be agreed on, we need to
@@ -819,41 +826,41 @@ static inline int __init activate_vmi(void)
#endif
if (vmi_ops.set_pte) {
- paravirt_ops.set_pte = vmi_set_pte;
- paravirt_ops.set_pte_at = vmi_set_pte_at;
- paravirt_ops.set_pmd = vmi_set_pmd;
+ pv_mmu_ops.set_pte = vmi_set_pte;
+ pv_mmu_ops.set_pte_at = vmi_set_pte_at;
+ pv_mmu_ops.set_pmd = vmi_set_pmd;
#ifdef CONFIG_X86_PAE
- paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
- paravirt_ops.set_pte_present = vmi_set_pte_present;
- paravirt_ops.set_pud = vmi_set_pud;
- paravirt_ops.pte_clear = vmi_pte_clear;
- paravirt_ops.pmd_clear = vmi_pmd_clear;
+ pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
+ pv_mmu_ops.set_pte_present = vmi_set_pte_present;
+ pv_mmu_ops.set_pud = vmi_set_pud;
+ pv_mmu_ops.pte_clear = vmi_pte_clear;
+ pv_mmu_ops.pmd_clear = vmi_pmd_clear;
#endif
}
if (vmi_ops.update_pte) {
- paravirt_ops.pte_update = vmi_update_pte;
- paravirt_ops.pte_update_defer = vmi_update_pte_defer;
+ pv_mmu_ops.pte_update = vmi_update_pte;
+ pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
}
vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
if (vmi_ops.allocate_page) {
- paravirt_ops.alloc_pt = vmi_allocate_pt;
- paravirt_ops.alloc_pd = vmi_allocate_pd;
- paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+ pv_mmu_ops.alloc_pt = vmi_allocate_pt;
+ pv_mmu_ops.alloc_pd = vmi_allocate_pd;
+ pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
}
vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
if (vmi_ops.release_page) {
- paravirt_ops.release_pt = vmi_release_pt;
- paravirt_ops.release_pd = vmi_release_pd;
+ pv_mmu_ops.release_pt = vmi_release_pt;
+ pv_mmu_ops.release_pd = vmi_release_pd;
}
/* Set linear is needed in all cases */
vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
#ifdef CONFIG_HIGHPTE
if (vmi_ops.set_linear_mapping)
- paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
+ pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
#endif
/*
@@ -863,17 +870,17 @@ static inline int __init activate_vmi(void)
* the backend. They are performance critical anyway, so requiring
* a patch is not a big problem.
*/
- paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
- paravirt_ops.iret = (void *)0xbadbab0;
+ pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
+ pv_cpu_ops.iret = (void *)0xbadbab0;
#ifdef CONFIG_SMP
- para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
+ para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(apic_read, APICRead);
- para_fill(apic_write, APICWrite);
- para_fill(apic_write_atomic, APICWrite);
+ para_fill(pv_apic_ops.apic_read, APICRead);
+ para_fill(pv_apic_ops.apic_write, APICWrite);
+ para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
#endif
/*
@@ -891,15 +898,15 @@ static inline int __init activate_vmi(void)
vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
vmi_timer_ops.cancel_alarm =
vmi_get_function(VMI_CALL_CancelAlarm);
- paravirt_ops.time_init = vmi_time_init;
- paravirt_ops.get_wallclock = vmi_get_wallclock;
- paravirt_ops.set_wallclock = vmi_set_wallclock;
+ pv_time_ops.time_init = vmi_time_init;
+ pv_time_ops.get_wallclock = vmi_get_wallclock;
+ pv_time_ops.set_wallclock = vmi_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
- paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
+ pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
+ pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
#endif
- paravirt_ops.sched_clock = vmi_sched_clock;
- paravirt_ops.get_cpu_khz = vmi_cpu_khz;
+ pv_time_ops.sched_clock = vmi_sched_clock;
+ pv_time_ops.get_cpu_khz = vmi_cpu_khz;
/* We have true wallclock functions; disable CMOS clock sync */
no_sync_cmos_clock = 1;
@@ -908,7 +915,7 @@ static inline int __init activate_vmi(void)
disable_vmi_timer = 1;
}
- para_fill(safe_halt, Halt);
+ para_fill(pv_irq_ops.safe_halt, Halt);
/*
* Alternative instruction rewriting doesn't happen soon enough
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index dda4e83649a0..33d367a3432e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -741,24 +741,12 @@ struct kmem_cache *pmd_cache;
void __init pgtable_cache_init(void)
{
- size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
-
- if (PTRS_PER_PMD > 1) {
+ if (PTRS_PER_PMD > 1)
pmd_cache = kmem_cache_create("pmd",
- PTRS_PER_PMD*sizeof(pmd_t),
- PTRS_PER_PMD*sizeof(pmd_t),
- SLAB_PANIC,
- pmd_ctor);
- if (!SHARED_KERNEL_PMD) {
- /* If we're in PAE mode and have a non-shared
- kernel pmd, then the pgd size must be a
- page size. This is because the pgd_list
- links through the page structure, so there
- can only be one pgd per page for this to
- work. */
- pgd_size = PAGE_SIZE;
- }
- }
+ PTRS_PER_PMD*sizeof(pmd_t),
+ PTRS_PER_PMD*sizeof(pmd_t),
+ SLAB_PANIC,
+ pmd_ctor);
}
/*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 493a083f6886..94c39aaf695f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -25,7 +25,6 @@
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
-#include <linux/smp.h>
#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
@@ -52,11 +51,25 @@
EXPORT_SYMBOL_GPL(hypercall_page);
-DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
-
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3. This may not be the current effective cr3, because
+ * its update may be being lazily deferred. However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early). If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
@@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu)
info.mfn = virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup);
- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+ printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
cpu, vcpup, info.mfn, info.offset);
/* Check to see if the hypervisor will put the vcpu_info
@@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu)
static void __init xen_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
- paravirt_ops.name);
+ pv_info.name);
printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
}
@@ -249,29 +262,10 @@ static void xen_halt(void)
xen_safe_halt();
}
-static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+static void xen_leave_lazy(void)
{
- BUG_ON(preemptible());
-
- switch (mode) {
- case PARAVIRT_LAZY_NONE:
- BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
- break;
-
- case PARAVIRT_LAZY_MMU:
- case PARAVIRT_LAZY_CPU:
- BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
- break;
-
- case PARAVIRT_LAZY_FLUSH:
- /* flush if necessary, but don't change state */
- if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
- xen_mc_flush();
- return;
- }
-
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
xen_mc_flush();
- x86_write_percpu(xen_lazy_mode, mode);
}
static unsigned long xen_store_tr(void)
@@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
* loaded properly. This will go away as soon as Xen has been
* modified to not save/restore %gs for normal hypercalls.
*/
- if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
loadsegment(gs, 0);
}
@@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void)
return x86_read_percpu(xen_cr3);
}
+static void set_current_cr3(void *v)
+{
+ x86_write_percpu(xen_current_cr3, (unsigned long)v);
+}
+
static void xen_write_cr3(unsigned long cr3)
{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+ unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
BUG_ON(preemptible());
- if (cr3 == x86_read_percpu(xen_cr3)) {
- /* just a simple tlb flush */
- xen_flush_tlb();
- return;
- }
+ mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
x86_write_percpu(xen_cr3, cr3);
+ op = mcs.args;
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = mfn;
- {
- struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
- unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-
- op = mcs.args;
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = mfn;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ /* Update xen_update_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
- }
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
/* Early in boot, while setting up the initial pagetable, assume
@@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}
+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+ struct mmuext_op op;
+ op.cmd = level;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
/* This needs to make sure the new pte page is pinned iff its being
attached to a pinned pagetable. */
static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
if (PagePinned(virt_to_page(mm->pgd))) {
SetPagePinned(page);
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
- else
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+ } else
/* make sure there are no stray mappings of
this page */
kmap_flush_unused();
@@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn)
struct page *page = pfn_to_page(pfn);
if (PagePinned(page)) {
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ }
}
}
@@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
/* special set_pte for pagetable initialization */
- paravirt_ops.set_pte = xen_set_pte_init;
+ pv_mmu_ops.set_pte = xen_set_pte_init;
init_mm.pgd = base;
/*
@@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
{
/* This will work as long as patching hasn't happened yet
(which it hasn't) */
- paravirt_ops.alloc_pt = xen_alloc_pt;
- paravirt_ops.set_pte = xen_set_pte;
+ pv_mmu_ops.alloc_pt = xen_alloc_pt;
+ pv_mmu_ops.set_pte = xen_set_pte;
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/*
@@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
{
- struct mmuext_op op;
+ unsigned level;
+
#ifdef CONFIG_X86_PAE
- op.cmd = MMUEXT_PIN_L3_TABLE;
+ level = MMUEXT_PIN_L3_TABLE;
#else
- op.cmd = MMUEXT_PIN_L3_TABLE;
+ level = MMUEXT_PIN_L2_TABLE;
#endif
- op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
+
+ pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
}
}
@@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void)
if (have_vcpu_info_placement) {
printk(KERN_INFO "Xen: using vcpu_info placement\n");
- paravirt_ops.save_fl = xen_save_fl_direct;
- paravirt_ops.restore_fl = xen_restore_fl_direct;
- paravirt_ops.irq_disable = xen_irq_disable_direct;
- paravirt_ops.irq_enable = xen_irq_enable_direct;
- paravirt_ops.read_cr2 = xen_read_cr2_direct;
- paravirt_ops.iret = xen_iret_direct;
+ pv_irq_ops.save_fl = xen_save_fl_direct;
+ pv_irq_ops.restore_fl = xen_restore_fl_direct;
+ pv_irq_ops.irq_disable = xen_irq_disable_direct;
+ pv_irq_ops.irq_enable = xen_irq_enable_direct;
+ pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
+ pv_cpu_ops.iret = xen_iret_direct;
}
}
@@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
start = end = reloc = NULL;
-#define SITE(x) \
- case PARAVIRT_PATCH(x): \
+#define SITE(op, x) \
+ case PARAVIRT_PATCH(op.x): \
if (have_vcpu_info_placement) { \
start = (char *)xen_##x##_direct; \
end = xen_##x##_direct_end; \
@@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
goto patch_site
switch (type) {
- SITE(irq_enable);
- SITE(irq_disable);
- SITE(save_fl);
- SITE(restore_fl);
+ SITE(pv_irq_ops, irq_enable);
+ SITE(pv_irq_ops, irq_disable);
+ SITE(pv_irq_ops, save_fl);
+ SITE(pv_irq_ops, restore_fl);
#undef SITE
patch_site:
@@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret;
}
-static const struct paravirt_ops xen_paravirt_ops __initdata = {
+static const struct pv_info xen_info __initdata = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
.name = "Xen",
- .banner = xen_banner,
+};
+static const struct pv_init_ops xen_init_ops __initdata = {
.patch = xen_patch,
+ .banner = xen_banner,
.memory_setup = xen_memory_setup,
.arch_setup = xen_arch_setup,
- .init_IRQ = xen_init_IRQ,
.post_allocator_init = xen_mark_init_mm_pinned,
+};
+static const struct pv_time_ops xen_time_ops __initdata = {
.time_init = xen_time_init,
+
.set_wallclock = xen_set_wallclock,
.get_wallclock = xen_get_wallclock,
.get_cpu_khz = xen_cpu_khz,
.sched_clock = xen_sched_clock,
+};
+static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.cpuid = xen_cpuid,
.set_debugreg = xen_set_debugreg,
@@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
- .read_cr2 = xen_read_cr2,
- .write_cr2 = xen_write_cr2,
-
- .read_cr3 = xen_read_cr3,
- .write_cr3 = xen_write_cr3,
-
.read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4,
- .save_fl = xen_save_fl,
- .restore_fl = xen_restore_fl,
- .irq_disable = xen_irq_disable,
- .irq_enable = xen_irq_enable,
- .safe_halt = xen_safe_halt,
- .halt = xen_halt,
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
@@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.set_iopl_mask = xen_set_iopl_mask,
.io_delay = xen_io_delay,
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_cpu,
+ .leave = xen_leave_lazy,
+ },
+};
+
+static const struct pv_irq_ops xen_irq_ops __initdata = {
+ .init_IRQ = xen_init_IRQ,
+ .save_fl = xen_save_fl,
+ .restore_fl = xen_restore_fl,
+ .irq_disable = xen_irq_disable,
+ .irq_enable = xen_irq_enable,
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+};
+
+static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
.apic_write = xen_apic_write,
.apic_write_atomic = xen_apic_write,
@@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.setup_secondary_clock = paravirt_nop,
.startup_ipi_hook = paravirt_nop,
#endif
+};
+
+static const struct pv_mmu_ops xen_mmu_ops __initdata = {
+ .pagetable_setup_start = xen_pagetable_setup_start,
+ .pagetable_setup_done = xen_pagetable_setup_done,
+
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3,
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
@@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
- .pagetable_setup_start = xen_pagetable_setup_start,
- .pagetable_setup_done = xen_pagetable_setup_done,
-
.alloc_pt = xen_alloc_pt_init,
.release_pt = xen_release_pt,
.alloc_pd = paravirt_nop,
@@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.dup_mmap = xen_dup_mmap,
.exit_mmap = xen_exit_mmap,
- .set_lazy_mode = xen_set_lazy_mode,
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_mmu,
+ .leave = xen_leave_lazy,
+ },
};
#ifdef CONFIG_SMP
@@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = {
};
+static void __init xen_reserve_top(void)
+{
+ unsigned long top = HYPERVISOR_VIRT_START;
+ struct xen_platform_parameters pp;
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+ top = pp.virt_start;
+
+ reserve_top_address(-top + 2 * PAGE_SIZE);
+}
+
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void)
BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
/* Install Xen paravirt ops */
- paravirt_ops = xen_paravirt_ops;
+ pv_info = xen_info;
+ pv_init_ops = xen_init_ops;
+ pv_time_ops = xen_time_ops;
+ pv_cpu_ops = xen_cpu_ops;
+ pv_irq_ops = xen_irq_ops;
+ pv_apic_ops = xen_apic_ops;
+ pv_mmu_ops = xen_mmu_ops;
+
machine_ops = xen_machine_ops;
#ifdef CONFIG_SMP
@@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void)
/* keep using Xen gdt for now; no urgent need to change it */
x86_write_percpu(xen_cr3, __pa(pgd));
+ x86_write_percpu(xen_current_cr3, __pa(pgd));
#ifdef CONFIG_SMP
/* Don't do the full vcpu_info placement stuff until we have a
@@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void)
xen_setup_vcpu_info_placement();
#endif
- paravirt_ops.kernel_rpl = 1;
+ pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
- paravirt_ops.kernel_rpl = 0;
+ pv_info.kernel_rpl = 0;
/* set the limit of our address space */
- reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+ xen_reserve_top();
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 874db0cd1d2a..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,6 @@
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>
-#include <linux/sched.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
if (mm == current->mm || mm == &init_mm) {
- if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
struct multicall_space mcs;
mcs = xen_mc_entry(0);
@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
}
#endif /* CONFIG_X86_PAE */
-
+enum pt_level {
+ PT_PGD,
+ PT_PUD,
+ PT_PMD,
+ PT_PTE
+};
/*
(Yet another) pagetable walker. This one is intended for pinning a
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
FIXADDR_TOP. But the important bit is that we don't pin beyond
there, because then we start getting into Xen's ptes.
*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
unsigned long limit)
{
pgd_t *pgd = pgd_base;
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
pud = pud_offset(pgd, 0);
if (PTRS_PER_PUD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pud), 0);
+ flush |= (*func)(virt_to_page(pud), PT_PUD);
for (; addr != pud_limit; pud++, addr = pud_next) {
pmd_t *pmd;
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
pmd = pmd_offset(pud, 0);
if (PTRS_PER_PMD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pmd), 0);
+ flush |= (*func)(virt_to_page(pmd), PT_PMD);
for (; addr != pmd_limit; pmd++) {
addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
if (pmd_none(*pmd))
continue;
- flush |= (*func)(pmd_page(*pmd), 0);
+ flush |= (*func)(pmd_page(*pmd), PT_PTE);
}
}
}
- flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+ flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
return flush;
}
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+ spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ ptl = __pte_lockptr(page);
+ spin_lock(ptl);
+#endif
+
+ return ptl;
+}
+
+static void do_unlock(void *v)
+{
+ spinlock_t *ptl = v;
+ spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = level;
+ op->arg1.mfn = pfn_to_mfn(pfn);
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
{
unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
int flush;
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page);
struct multicall_space mcs = __xen_mc_entry(0);
+ spinlock_t *ptl;
flush = 0;
+ ptl = NULL;
+ if (level == PT_PTE)
+ ptl = lock_pte(page);
+
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL_RO),
- flags);
+ level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+ if (level == PT_PTE)
+ xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+ if (ptl) {
+ /* Queue a deferred unlock for when this batch
+ is completed. */
+ xen_mc_callback(do_unlock, ptl);
+ }
}
return flush;
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
- struct multicall_space mcs;
- struct mmuext_op *op;
+ unsigned level;
xen_mc_batch();
@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
xen_mc_batch();
}
- mcs = __xen_mc_entry(sizeof(*op));
- op = mcs.args;
-
#ifdef CONFIG_X86_PAE
- op->cmd = MMUEXT_PIN_L3_TABLE;
+ level = MMUEXT_PIN_L3_TABLE;
#else
- op->cmd = MMUEXT_PIN_L2_TABLE;
+ level = MMUEXT_PIN_L2_TABLE;
#endif
- op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_do_pin(level, PFN_DOWN(__pa(pgd)));
xen_mc_issue(0);
}
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
/* The init_mm pagetable is really pinned as soon as its created, but
that's before we have page structures to store the bits. So do all
the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
{
SetPagePinned(page);
return 0;
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
{
unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
if (pgfl && !PageHighMem(page)) {
void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page);
- struct multicall_space mcs = __xen_mc_entry(0);
+ spinlock_t *ptl = NULL;
+ struct multicall_space mcs;
+
+ if (level == PT_PTE) {
+ ptl = lock_pte(page);
+
+ xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+ }
+
+ mcs = __xen_mc_entry(0);
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL),
- flags);
+ level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+ if (ptl) {
+ /* unlock when batch completed */
+ xen_mc_callback(do_unlock, ptl);
+ }
}
return 0; /* never need to flush on unpin */
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
/* Release a pagetables pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
- struct mmuext_op *op;
- struct multicall_space mcs;
-
xen_mc_batch();
- mcs = __xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_UNPIN_TABLE;
- op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
pgd_walk(pgd, unpin_page, TASK_SIZE);
@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)
if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
leave_mm(smp_processor_id());
+
+ /* If this cpu still has a stale cr3 reference, then make sure
+ it has been flushed. */
+ if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+ load_cr3(swapper_pg_dir);
+ arch_flush_lazy_cpu_mode();
+ }
}
static void drop_mm_ref(struct mm_struct *mm)
{
+ cpumask_t mask;
+ unsigned cpu;
+
if (current->active_mm == mm) {
if (current->mm == mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
+ arch_flush_lazy_cpu_mode();
}
- if (!cpus_empty(mm->cpu_vm_mask))
- xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
- mm, 1);
+ /* Get the "official" set of cpus referring to our pagetable. */
+ mask = mm->cpu_vm_mask;
+
+ /* It's possible that a vcpu may have a stale reference to our
+ cr3, because its in lazy mode, and it hasn't yet flushed
+ its set of pending hypercalls yet. In this case, we can
+ look at its actual current cr3 value, and force it to flush
+ if needed. */
+ for_each_online_cpu(cpu) {
+ if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+ cpu_set(cpu, mask);
+ }
+
+ if (!cpus_empty(mask))
+ xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
/* pgd may not be pinned in the error exit path of execve */
if (PagePinned(virt_to_page(mm->pgd)))
xen_pgd_unpin(mm->pgd);
+
spin_unlock(&mm->page_table_lock);
}
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c837e8e463db..5e6f36f6d876 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -26,13 +26,22 @@
#include "multicalls.h"
+#define MC_DEBUG 1
+
#define MC_BATCH 32
#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
struct mc_buffer {
struct multicall_entry entries[MC_BATCH];
+#if MC_DEBUG
+ struct multicall_entry debug[MC_BATCH];
+#endif
u64 args[MC_ARGS];
- unsigned mcidx, argidx;
+ struct callback {
+ void (*fn)(void *);
+ void *data;
+ } callbacks[MC_BATCH];
+ unsigned mcidx, argidx, cbidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +52,7 @@ void xen_mc_flush(void)
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
int ret = 0;
unsigned long flags;
+ int i;
BUG_ON(preemptible());
@@ -51,13 +61,31 @@ void xen_mc_flush(void)
local_irq_save(flags);
if (b->mcidx) {
- int i;
+#if MC_DEBUG
+ memcpy(b->debug, b->entries,
+ b->mcidx * sizeof(struct multicall_entry));
+#endif
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG();
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
+
+#if MC_DEBUG
+ if (ret) {
+ printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
+ ret, smp_processor_id());
+ for(i = 0; i < b->mcidx; i++) {
+ printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+ i+1, b->mcidx,
+ b->debug[i].op,
+ b->debug[i].args[0],
+ b->entries[i].result);
+ }
+ }
+#endif
+
b->mcidx = 0;
b->argidx = 0;
} else
@@ -65,6 +93,13 @@ void xen_mc_flush(void)
local_irq_restore(flags);
+ for(i = 0; i < b->cbidx; i++) {
+ struct callback *cb = &b->callbacks[i];
+
+ (*cb->fn)(cb->data);
+ }
+ b->cbidx = 0;
+
BUG_ON(ret);
}
@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)
return ret;
}
+
+void xen_mc_callback(void (*fn)(void *), void *data)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct callback *cb;
+
+ if (b->cbidx == MC_BATCH)
+ xen_mc_flush();
+
+ cb = &b->callbacks[b->cbidx++];
+ cb->fn = fn;
+ cb->data = data;
+}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index e6f7530b156c..8bae996d99a3 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -35,11 +35,14 @@ void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
- if ((xen_get_lazy_mode() & mode) == 0)
+ if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
}
+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4fa33c27ccb6..d53bf9d8a72d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait)
{
struct call_data_struct data;
- int cpus;
+ int cpus, cpu;
+ bool yield;
/* Holding any lock stops cpus from going down. */
spin_lock(&call_lock);
@@ -399,9 +400,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
/* Send a message to other CPUs and wait for them to respond */
xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
- /* Make sure other vcpus get a chance to run.
- XXX too severe? Maybe we should check the other CPU's states? */
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ /* Make sure other vcpus get a chance to run if they need to. */
+ yield = false;
+ for_each_cpu_mask(cpu, mask)
+ if (xen_vcpu_stolen(cpu))
+ yield = true;
+
+ if (yield)
+ HYPERVISOR_sched_op(SCHEDOP_yield, 0);
/* Wait for response */
while (atomic_read(&data.started) != cpus ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index dfd6db69ead5..d083ff5ef088 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
} while (get64(&state->state_entry_time) != state_time);
}
+/* return true when a vcpu could run but has no real cpu to run on */
+bool xen_vcpu_stolen(int vcpu)
+{
+ return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
+}
+
static void setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b9aaea45f07f..b02a909bfd4c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
DECLARE_PER_CPU(unsigned long, xen_cr3);
+DECLARE_PER_CPU(unsigned long, xen_current_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info *HYPERVISOR_shared_info;
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
-void xen_mark_init_mm_pinned(void);
-
-DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+bool xen_vcpu_stolen(int vcpu);
-static inline unsigned xen_get_lazy_mode(void)
-{
- return x86_read_percpu(xen_lazy_mode);
-}
+void xen_mark_init_mm_pinned(void);
void __init xen_fill_possible_map(void);
diff --git a/drivers/char/hvc_lguest.c b/drivers/char/hvc_lguest.c
index 3d6bd0baa56d..efccb2155830 100644
--- a/drivers/char/hvc_lguest.c
+++ b/drivers/char/hvc_lguest.c
@@ -115,7 +115,7 @@ static struct hv_ops lguest_cons = {
* (0), and the struct hv_ops containing the put_chars() function. */
static int __init cons_init(void)
{
- if (strcmp(paravirt_ops.name, "lguest") != 0)
+ if (strcmp(pv_info.name, "lguest") != 0)
return 0;
return hvc_instantiate(0, 0, &lguest_cons);
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4a315f08a567..a0788c12b392 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -248,8 +248,8 @@ static void unmap_switcher(void)
}
/*H:130 Our Guest is usually so well behaved; it never tries to do things it
- * isn't allowed to. Unfortunately, "struct paravirt_ops" isn't quite
- * complete, because it doesn't contain replacements for the Intel I/O
+ * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't
+ * quite complete, because it doesn't contain replacements for the Intel I/O
* instructions. As a result, the Guest sometimes fumbles across one during
* the boot process as it probes for various things which are usually attached
* to a PC.
@@ -694,7 +694,7 @@ static int __init init(void)
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
if (paravirt_enabled()) {
- printk("lguest is afraid of %s\n", paravirt_ops.name);
+ printk("lguest is afraid of %s\n", pv_info.name);
return -EPERM;
}
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
index 4a579c840301..3ba337dde857 100644
--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -23,7 +23,7 @@
*
* So how does the kernel know it's a Guest? The Guest starts at a special
* entry point marked with a magic string, which sets up a few things then
- * calls here. We replace the native functions in "struct paravirt_ops"
+ * calls here. We replace the native functions various "paravirt" structures
* with our Guest versions, then boot like normal. :*/
/*
@@ -97,29 +97,17 @@ static cycle_t clock_base;
* them as a batch when lazy_mode is eventually turned off. Because hypercalls
* are reasonably expensive, batching them up makes sense. For example, a
* large mmap might update dozens of page table entries: that code calls
- * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls
- * lguest_lazy_mode(PARAVIRT_LAZY_NONE).
+ * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
+ * lguest_leave_lazy_mode().
*
* So, when we're in lazy mode, we call async_hypercall() to store the call for
* future processing. When lazy mode is turned off we issue a hypercall to
* flush the stored calls.
- *
- * There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which
- * indicates we're to flush any outstanding calls immediately. This is used
- * when an interrupt handler does a kmap_atomic(): the page table changes must
- * happen immediately even if we're in the middle of a batch. Usually we're
- * not, though, so there's nothing to do. */
-static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */
-static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
+ */
+static void lguest_leave_lazy_mode(void)
{
- if (mode == PARAVIRT_LAZY_FLUSH) {
- if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE))
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
- } else {
- lazy_mode = mode;
- if (mode == PARAVIRT_LAZY_NONE)
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
- }
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
+ hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
}
static void lazy_hcall(unsigned long call,
@@ -127,7 +115,7 @@ static void lazy_hcall(unsigned long call,
unsigned long arg2,
unsigned long arg3)
{
- if (lazy_mode == PARAVIRT_LAZY_NONE)
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
hcall(call, arg1, arg2, arg3);
else
async_hcall(call, arg1, arg2, arg3);
@@ -331,7 +319,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
}
/*G:038 That's enough excitement for now, back to ploughing through each of
- * the paravirt_ops (we're about 1/3 of the way through).
+ * the different pv_ops structures (we're about 1/3 of the way through).
*
* This is the Local Descriptor Table, another weird Intel thingy. Linux only
* uses this for some strange applications like Wine. We don't do anything
@@ -558,7 +546,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
}
-/* Unfortunately for Lguest, the paravirt_ops for page tables were based on
+/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
* native page table operations. On native hardware you can set a new page
* table entry whenever you want, but if you want to remove one you have to do
* a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -782,7 +770,7 @@ static void lguest_time_init(void)
clocksource_register(&lguest_clock);
/* Now we've set up our clock, we can use it as the scheduler clock */
- paravirt_ops.sched_clock = lguest_sched_clock;
+ pv_time_ops.sched_clock = lguest_sched_clock;
/* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */
@@ -904,7 +892,7 @@ static __init char *lguest_memory_setup(void)
/*G:050
* Patching (Powerfully Placating Performance Pedants)
*
- * We have already seen that "struct paravirt_ops" lets us replace simple
+ * We have already seen that pv_ops structures let us replace simple
* native instructions with calls to the appropriate back end all throughout
* the kernel. This allows the same kernel to run as a Guest and as a native
* kernel, but it's slow because of all the indirect branches.
@@ -929,10 +917,10 @@ static const struct lguest_insns
{
const char *start, *end;
} lguest_insns[] = {
- [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti },
- [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },
- [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },
+ [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
+ [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
+ [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
+ [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
};
/* Now our patch routine is fairly simple (based on the native one in
@@ -959,9 +947,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
return insn_len;
}
-/*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops
- * structure in the kernel provides a single point for (almost) every routine
- * we have to override to avoid privileged instructions. */
+/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
+ * structures in the kernel provide points for (almost) every routine we have
+ * to override to avoid privileged instructions. */
__init void lguest_init(void *boot)
{
/* Copy boot parameters first: the Launcher put the physical location
@@ -976,54 +964,70 @@ __init void lguest_init(void *boot)
/* We're under lguest, paravirt is enabled, and we're running at
* privilege level 1, not 0 as normal. */
- paravirt_ops.name = "lguest";
- paravirt_ops.paravirt_enabled = 1;
- paravirt_ops.kernel_rpl = 1;
+ pv_info.name = "lguest";
+ pv_info.paravirt_enabled = 1;
+ pv_info.kernel_rpl = 1;
/* We set up all the lguest overrides for sensitive operations. These
* are detailed with the operations themselves. */
- paravirt_ops.save_fl = save_fl;
- paravirt_ops.restore_fl = restore_fl;
- paravirt_ops.irq_disable = irq_disable;
- paravirt_ops.irq_enable = irq_enable;
- paravirt_ops.load_gdt = lguest_load_gdt;
- paravirt_ops.memory_setup = lguest_memory_setup;
- paravirt_ops.cpuid = lguest_cpuid;
- paravirt_ops.write_cr3 = lguest_write_cr3;
- paravirt_ops.flush_tlb_user = lguest_flush_tlb_user;
- paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
- paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
- paravirt_ops.set_pte = lguest_set_pte;
- paravirt_ops.set_pte_at = lguest_set_pte_at;
- paravirt_ops.set_pmd = lguest_set_pmd;
+
+ /* interrupt-related operations */
+ pv_irq_ops.init_IRQ = lguest_init_IRQ;
+ pv_irq_ops.save_fl = save_fl;
+ pv_irq_ops.restore_fl = restore_fl;
+ pv_irq_ops.irq_disable = irq_disable;
+ pv_irq_ops.irq_enable = irq_enable;
+ pv_irq_ops.safe_halt = lguest_safe_halt;
+
+ /* init-time operations */
+ pv_init_ops.memory_setup = lguest_memory_setup;
+ pv_init_ops.patch = lguest_patch;
+
+ /* Intercepts of various cpu instructions */
+ pv_cpu_ops.load_gdt = lguest_load_gdt;
+ pv_cpu_ops.cpuid = lguest_cpuid;
+ pv_cpu_ops.load_idt = lguest_load_idt;
+ pv_cpu_ops.iret = lguest_iret;
+ pv_cpu_ops.load_esp0 = lguest_load_esp0;
+ pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
+ pv_cpu_ops.set_ldt = lguest_set_ldt;
+ pv_cpu_ops.load_tls = lguest_load_tls;
+ pv_cpu_ops.set_debugreg = lguest_set_debugreg;
+ pv_cpu_ops.clts = lguest_clts;
+ pv_cpu_ops.read_cr0 = lguest_read_cr0;
+ pv_cpu_ops.write_cr0 = lguest_write_cr0;
+ pv_cpu_ops.read_cr4 = lguest_read_cr4;
+ pv_cpu_ops.write_cr4 = lguest_write_cr4;
+ pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
+ pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
+ pv_cpu_ops.wbinvd = lguest_wbinvd;
+ pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
+ pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+
+ /* pagetable management */
+ pv_mmu_ops.write_cr3 = lguest_write_cr3;
+ pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
+ pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
+ pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+ pv_mmu_ops.set_pte = lguest_set_pte;
+ pv_mmu_ops.set_pte_at = lguest_set_pte_at;
+ pv_mmu_ops.set_pmd = lguest_set_pmd;
+ pv_mmu_ops.read_cr2 = lguest_read_cr2;
+ pv_mmu_ops.read_cr3 = lguest_read_cr3;
+ pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
+ pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+
#ifdef CONFIG_X86_LOCAL_APIC
- paravirt_ops.apic_write = lguest_apic_write;
- paravirt_ops.apic_write_atomic = lguest_apic_write;
- paravirt_ops.apic_read = lguest_apic_read;
+ /* apic read/write intercepts */
+ pv_apic_ops.apic_write = lguest_apic_write;
+ pv_apic_ops.apic_write_atomic = lguest_apic_write;
+ pv_apic_ops.apic_read = lguest_apic_read;
#endif
- paravirt_ops.load_idt = lguest_load_idt;
- paravirt_ops.iret = lguest_iret;
- paravirt_ops.load_esp0 = lguest_load_esp0;
- paravirt_ops.load_tr_desc = lguest_load_tr_desc;
- paravirt_ops.set_ldt = lguest_set_ldt;
- paravirt_ops.load_tls = lguest_load_tls;
- paravirt_ops.set_debugreg = lguest_set_debugreg;
- paravirt_ops.clts = lguest_clts;
- paravirt_ops.read_cr0 = lguest_read_cr0;
- paravirt_ops.write_cr0 = lguest_write_cr0;
- paravirt_ops.init_IRQ = lguest_init_IRQ;
- paravirt_ops.read_cr2 = lguest_read_cr2;
- paravirt_ops.read_cr3 = lguest_read_cr3;
- paravirt_ops.read_cr4 = lguest_read_cr4;
- paravirt_ops.write_cr4 = lguest_write_cr4;
- paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
- paravirt_ops.write_idt_entry = lguest_write_idt_entry;
- paravirt_ops.patch = lguest_patch;
- paravirt_ops.safe_halt = lguest_safe_halt;
- paravirt_ops.get_wallclock = lguest_get_wallclock;
- paravirt_ops.time_init = lguest_time_init;
- paravirt_ops.set_lazy_mode = lguest_lazy_mode;
- paravirt_ops.wbinvd = lguest_wbinvd;
+
+ /* time operations */
+ pv_time_ops.get_wallclock = lguest_get_wallclock;
+ pv_time_ops.time_init = lguest_time_init;
+
/* Now is a good time to look at the implementations of these functions
* before returning to the rest of lguest_init(). */
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c
index 9e7752cc8002..57329788f8a7 100644
--- a/drivers/lguest/lguest_bus.c
+++ b/drivers/lguest/lguest_bus.c
@@ -201,7 +201,7 @@ static void scan_devices(void)
* "struct lguest_device_desc" array. */
static int __init lguest_bus_init(void)
{
- if (strcmp(paravirt_ops.name, "lguest") != 0)
+ if (strcmp(pv_info.name, "lguest") != 0)
return 0;
/* Devices are in a single page above top of "normal" mem */
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 9fa3fa9e62d1..f59d370c5df4 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -25,27 +25,22 @@ struct tss_struct;
struct mm_struct;
struct desc_struct;
-/* Lazy mode for batching updates / context switch */
-enum paravirt_lazy_mode {
- PARAVIRT_LAZY_NONE = 0,
- PARAVIRT_LAZY_MMU = 1,
- PARAVIRT_LAZY_CPU = 2,
- PARAVIRT_LAZY_FLUSH = 3,
-};
-
-struct paravirt_ops
-{
+/* general info */
+struct pv_info {
unsigned int kernel_rpl;
int shared_kernel_pmd;
- int paravirt_enabled;
+ int paravirt_enabled;
const char *name;
+};
+struct pv_init_ops {
/*
- * Patch may replace one of the defined code sequences with arbitrary
- * code, subject to the same register constraints. This generally
- * means the code is not free to clobber any registers other than EAX.
- * The patch function should return the number of bytes of code
- * generated, as we nop pad the rest in generic code.
+ * Patch may replace one of the defined code sequences with
+ * arbitrary code, subject to the same register constraints.
+ * This generally means the code is not free to clobber any
+ * registers other than EAX. The patch function should return
+ * the number of bytes of code generated, as we nop pad the
+ * rest in generic code.
*/
unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
unsigned long addr, unsigned len);
@@ -55,29 +50,29 @@ struct paravirt_ops
char *(*memory_setup)(void);
void (*post_allocator_init)(void);
- void (*init_IRQ)(void);
- void (*time_init)(void);
-
- /*
- * Called before/after init_mm pagetable setup. setup_start
- * may reset %cr3, and may pre-install parts of the pagetable;
- * pagetable setup is expected to preserve any existing
- * mapping.
- */
- void (*pagetable_setup_start)(pgd_t *pgd_base);
- void (*pagetable_setup_done)(pgd_t *pgd_base);
-
/* Print a banner to identify the environment */
void (*banner)(void);
+};
+
+
+struct pv_lazy_ops {
+ /* Set deferred update mode, used for batching operations. */
+ void (*enter)(void);
+ void (*leave)(void);
+};
+
+struct pv_time_ops {
+ void (*time_init)(void);
/* Set and set time of day */
unsigned long (*get_wallclock)(void);
int (*set_wallclock)(unsigned long);
- /* cpuid emulation, mostly so that caps bits can be disabled */
- void (*cpuid)(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx);
+ unsigned long long (*sched_clock)(void);
+ unsigned long (*get_cpu_khz)(void);
+};
+struct pv_cpu_ops {
/* hooks for various privileged instructions */
unsigned long (*get_debugreg)(int regno);
void (*set_debugreg)(int regno, unsigned long value);
@@ -87,41 +82,10 @@ struct paravirt_ops
unsigned long (*read_cr0)(void);
void (*write_cr0)(unsigned long);
- unsigned long (*read_cr2)(void);
- void (*write_cr2)(unsigned long);
-
- unsigned long (*read_cr3)(void);
- void (*write_cr3)(unsigned long);
-
unsigned long (*read_cr4_safe)(void);
unsigned long (*read_cr4)(void);
void (*write_cr4)(unsigned long);
- /*
- * Get/set interrupt state. save_fl and restore_fl are only
- * expected to use X86_EFLAGS_IF; all other bits
- * returned from save_fl are undefined, and may be ignored by
- * restore_fl.
- */
- unsigned long (*save_fl)(void);
- void (*restore_fl)(unsigned long);
- void (*irq_disable)(void);
- void (*irq_enable)(void);
- void (*safe_halt)(void);
- void (*halt)(void);
-
- void (*wbinvd)(void);
-
- /* MSR, PMC and TSR operations.
- err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
- u64 (*read_msr)(unsigned int msr, int *err);
- int (*write_msr)(unsigned int msr, u64 val);
-
- u64 (*read_tsc)(void);
- u64 (*read_pmc)(void);
- unsigned long long (*sched_clock)(void);
- unsigned long (*get_cpu_khz)(void);
-
/* Segment descriptor handling */
void (*load_tr_desc)(void);
void (*load_gdt)(const struct Xgt_desc_struct *);
@@ -140,18 +104,47 @@ struct paravirt_ops
void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t);
void (*set_iopl_mask)(unsigned mask);
+
+ void (*wbinvd)(void);
void (*io_delay)(void);
+ /* cpuid emulation, mostly so that caps bits can be disabled */
+ void (*cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ /* MSR, PMC and TSR operations.
+ err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (*read_msr)(unsigned int msr, int *err);
+ int (*write_msr)(unsigned int msr, u64 val);
+
+ u64 (*read_tsc)(void);
+ u64 (*read_pmc)(void);
+
+ /* These two are jmp to, not actually called. */
+ void (*irq_enable_sysexit)(void);
+ void (*iret)(void);
+
+ struct pv_lazy_ops lazy_mode;
+};
+
+struct pv_irq_ops {
+ void (*init_IRQ)(void);
+
/*
- * Hooks for intercepting the creation/use/destruction of an
- * mm_struct.
+ * Get/set interrupt state. save_fl and restore_fl are only
+ * expected to use X86_EFLAGS_IF; all other bits
+ * returned from save_fl are undefined, and may be ignored by
+ * restore_fl.
*/
- void (*activate_mm)(struct mm_struct *prev,
- struct mm_struct *next);
- void (*dup_mmap)(struct mm_struct *oldmm,
- struct mm_struct *mm);
- void (*exit_mmap)(struct mm_struct *mm);
+ unsigned long (*save_fl)(void);
+ void (*restore_fl)(unsigned long);
+ void (*irq_disable)(void);
+ void (*irq_enable)(void);
+ void (*safe_halt)(void);
+ void (*halt)(void);
+};
+struct pv_apic_ops {
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Direct APIC operations, principally for VMI. Ideally
@@ -167,6 +160,34 @@ struct paravirt_ops
unsigned long start_eip,
unsigned long start_esp);
#endif
+};
+
+struct pv_mmu_ops {
+ /*
+ * Called before/after init_mm pagetable setup. setup_start
+ * may reset %cr3, and may pre-install parts of the pagetable;
+ * pagetable setup is expected to preserve any existing
+ * mapping.
+ */
+ void (*pagetable_setup_start)(pgd_t *pgd_base);
+ void (*pagetable_setup_done)(pgd_t *pgd_base);
+
+ unsigned long (*read_cr2)(void);
+ void (*write_cr2)(unsigned long);
+
+ unsigned long (*read_cr3)(void);
+ void (*write_cr3)(unsigned long);
+
+ /*
+ * Hooks for intercepting the creation/use/destruction of an
+ * mm_struct.
+ */
+ void (*activate_mm)(struct mm_struct *prev,
+ struct mm_struct *next);
+ void (*dup_mmap)(struct mm_struct *oldmm,
+ struct mm_struct *mm);
+ void (*exit_mmap)(struct mm_struct *mm);
+
/* TLB operations */
void (*flush_tlb_user)(void);
@@ -191,15 +212,12 @@ struct paravirt_ops
void (*pte_update_defer)(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
-#ifdef CONFIG_HIGHPTE
- void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
-#endif
-
#ifdef CONFIG_X86_PAE
void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
- void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte);
+ void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
void (*set_pud)(pud_t *pudp, pud_t pudval);
- void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+ void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void (*pmd_clear)(pmd_t *pmdp);
unsigned long long (*pte_val)(pte_t);
@@ -217,21 +235,40 @@ struct paravirt_ops
pgd_t (*make_pgd)(unsigned long pgd);
#endif
- /* Set deferred update mode, used for batching operations. */
- void (*set_lazy_mode)(enum paravirt_lazy_mode mode);
+#ifdef CONFIG_HIGHPTE
+ void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
+#endif
- /* These two are jmp to, not actually called. */
- void (*irq_enable_sysexit)(void);
- void (*iret)(void);
+ struct pv_lazy_ops lazy_mode;
};
-extern struct paravirt_ops paravirt_ops;
+/* This contains all the paravirt structures: we get a convenient
+ * number for each function using the offset which we use to indicate
+ * what to patch. */
+struct paravirt_patch_template
+{
+ struct pv_init_ops pv_init_ops;
+ struct pv_time_ops pv_time_ops;
+ struct pv_cpu_ops pv_cpu_ops;
+ struct pv_irq_ops pv_irq_ops;
+ struct pv_apic_ops pv_apic_ops;
+ struct pv_mmu_ops pv_mmu_ops;
+};
+
+extern struct pv_info pv_info;
+extern struct pv_init_ops pv_init_ops;
+extern struct pv_time_ops pv_time_ops;
+extern struct pv_cpu_ops pv_cpu_ops;
+extern struct pv_irq_ops pv_irq_ops;
+extern struct pv_apic_ops pv_apic_ops;
+extern struct pv_mmu_ops pv_mmu_ops;
#define PARAVIRT_PATCH(x) \
- (offsetof(struct paravirt_ops, x) / sizeof(void *))
+ (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
-#define paravirt_type(type) \
- [paravirt_typenum] "i" (PARAVIRT_PATCH(type))
+#define paravirt_type(op) \
+ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
+ [paravirt_opptr] "m" (op)
#define paravirt_clobber(clobber) \
[paravirt_clobber] "i" (clobber)
@@ -258,7 +295,7 @@ unsigned paravirt_patch_call(void *insnbuf,
const void *target, u16 tgt_clobbers,
unsigned long addr, u16 site_clobbers,
unsigned len);
-unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
unsigned long addr, unsigned len);
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
unsigned long addr, unsigned len);
@@ -271,14 +308,14 @@ int paravirt_disable_iospace(void);
/*
* This generates an indirect call based on the operation type number.
* The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_ops structure, and can therefore be freely
- * converted back into a structure offset.
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
*/
-#define PARAVIRT_CALL "call *(paravirt_ops+%c[paravirt_typenum]*4);"
+#define PARAVIRT_CALL "call *%[paravirt_opptr];"
/*
- * These macros are intended to wrap calls into a paravirt_ops
- * operation, so that they can be later identified and patched at
+ * These macros are intended to wrap calls through one of the paravirt
+ * ops structs, so that they can be later identified and patched at
* runtime.
*
* Normally, a call to a pv_op function is a simple indirect call:
@@ -301,7 +338,7 @@ int paravirt_disable_iospace(void);
* The call instruction itself is marked by placing its start address
* and size into the .parainstructions section, so that
* apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend paravirt_ops
+ * appropriate patching under the control of the backend pv_init_ops
* implementation.
*
* Unfortunately there's no way to get gcc to generate the args setup
@@ -409,36 +446,36 @@ int paravirt_disable_iospace(void);
static inline int paravirt_enabled(void)
{
- return paravirt_ops.paravirt_enabled;
+ return pv_info.paravirt_enabled;
}
static inline void load_esp0(struct tss_struct *tss,
struct thread_struct *thread)
{
- PVOP_VCALL2(load_esp0, tss, thread);
+ PVOP_VCALL2(pv_cpu_ops.load_esp0, tss, thread);
}
-#define ARCH_SETUP paravirt_ops.arch_setup();
+#define ARCH_SETUP pv_init_ops.arch_setup();
static inline unsigned long get_wallclock(void)
{
- return PVOP_CALL0(unsigned long, get_wallclock);
+ return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
}
static inline int set_wallclock(unsigned long nowtime)
{
- return PVOP_CALL1(int, set_wallclock, nowtime);
+ return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
}
static inline void (*choose_time_init(void))(void)
{
- return paravirt_ops.time_init;
+ return pv_time_ops.time_init;
}
/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
- PVOP_VCALL4(cpuid, eax, ebx, ecx, edx);
+ PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
}
/*
@@ -446,87 +483,87 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
*/
static inline unsigned long paravirt_get_debugreg(int reg)
{
- return PVOP_CALL1(unsigned long, get_debugreg, reg);
+ return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
static inline void set_debugreg(unsigned long val, int reg)
{
- PVOP_VCALL2(set_debugreg, reg, val);
+ PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
}
static inline void clts(void)
{
- PVOP_VCALL0(clts);
+ PVOP_VCALL0(pv_cpu_ops.clts);
}
static inline unsigned long read_cr0(void)
{
- return PVOP_CALL0(unsigned long, read_cr0);
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
}
static inline void write_cr0(unsigned long x)
{
- PVOP_VCALL1(write_cr0, x);
+ PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
}
static inline unsigned long read_cr2(void)
{
- return PVOP_CALL0(unsigned long, read_cr2);
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
}
static inline void write_cr2(unsigned long x)
{
- PVOP_VCALL1(write_cr2, x);
+ PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}
static inline unsigned long read_cr3(void)
{
- return PVOP_CALL0(unsigned long, read_cr3);
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}
static inline void write_cr3(unsigned long x)
{
- PVOP_VCALL1(write_cr3, x);
+ PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}
static inline unsigned long read_cr4(void)
{
- return PVOP_CALL0(unsigned long, read_cr4);
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
static inline unsigned long read_cr4_safe(void)
{
- return PVOP_CALL0(unsigned long, read_cr4_safe);
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}
static inline void write_cr4(unsigned long x)
{
- PVOP_VCALL1(write_cr4, x);
+ PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}
static inline void raw_safe_halt(void)
{
- PVOP_VCALL0(safe_halt);
+ PVOP_VCALL0(pv_irq_ops.safe_halt);
}
static inline void halt(void)
{
- PVOP_VCALL0(safe_halt);
+ PVOP_VCALL0(pv_irq_ops.safe_halt);
}
static inline void wbinvd(void)
{
- PVOP_VCALL0(wbinvd);
+ PVOP_VCALL0(pv_cpu_ops.wbinvd);
}
-#define get_kernel_rpl() (paravirt_ops.kernel_rpl)
+#define get_kernel_rpl() (pv_info.kernel_rpl)
static inline u64 paravirt_read_msr(unsigned msr, int *err)
{
- return PVOP_CALL2(u64, read_msr, msr, err);
+ return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
}
static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
{
- return PVOP_CALL3(int, write_msr, msr, low, high);
+ return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
}
/* These should all do BUG_ON(_err), but our headers are too tangled. */
@@ -560,7 +597,7 @@ static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
static inline u64 paravirt_read_tsc(void)
{
- return PVOP_CALL0(u64, read_tsc);
+ return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
}
#define rdtscl(low) do { \
@@ -572,15 +609,15 @@ static inline u64 paravirt_read_tsc(void)
static inline unsigned long long paravirt_sched_clock(void)
{
- return PVOP_CALL0(unsigned long long, sched_clock);
+ return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
-#define calculate_cpu_khz() (paravirt_ops.get_cpu_khz())
+#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())
#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
static inline unsigned long long paravirt_read_pmc(int counter)
{
- return PVOP_CALL1(u64, read_pmc, counter);
+ return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
}
#define rdpmc(counter,low,high) do { \
@@ -591,61 +628,61 @@ static inline unsigned long long paravirt_read_pmc(int counter)
static inline void load_TR_desc(void)
{
- PVOP_VCALL0(load_tr_desc);
+ PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
}
static inline void load_gdt(const struct Xgt_desc_struct *dtr)
{
- PVOP_VCALL1(load_gdt, dtr);
+ PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
}
static inline void load_idt(const struct Xgt_desc_struct *dtr)
{
- PVOP_VCALL1(load_idt, dtr);
+ PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
- PVOP_VCALL2(set_ldt, addr, entries);
+ PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
static inline void store_gdt(struct Xgt_desc_struct *dtr)
{
- PVOP_VCALL1(store_gdt, dtr);
+ PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
}
static inline void store_idt(struct Xgt_desc_struct *dtr)
{
- PVOP_VCALL1(store_idt, dtr);
+ PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
}
static inline unsigned long paravirt_store_tr(void)
{
- return PVOP_CALL0(unsigned long, store_tr);
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
}
#define store_tr(tr) ((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
- PVOP_VCALL2(load_tls, t, cpu);
+ PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
}
static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high)
{
- PVOP_VCALL4(write_ldt_entry, dt, entry, low, high);
+ PVOP_VCALL4(pv_cpu_ops.write_ldt_entry, dt, entry, low, high);
}
static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high)
{
- PVOP_VCALL4(write_gdt_entry, dt, entry, low, high);
+ PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, low, high);
}
static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high)
{
- PVOP_VCALL4(write_idt_entry, dt, entry, low, high);
+ PVOP_VCALL4(pv_cpu_ops.write_idt_entry, dt, entry, low, high);
}
static inline void set_iopl_mask(unsigned mask)
{
- PVOP_VCALL1(set_iopl_mask, mask);
+ PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
}
/* The paravirtualized I/O functions */
static inline void slow_down_io(void) {
- paravirt_ops.io_delay();
+ pv_cpu_ops.io_delay();
#ifdef REALLY_SLOW_IO
- paravirt_ops.io_delay();
- paravirt_ops.io_delay();
- paravirt_ops.io_delay();
+ pv_cpu_ops.io_delay();
+ pv_cpu_ops.io_delay();
+ pv_cpu_ops.io_delay();
#endif
}
@@ -655,121 +692,120 @@ static inline void slow_down_io(void) {
*/
static inline void apic_write(unsigned long reg, unsigned long v)
{
- PVOP_VCALL2(apic_write, reg, v);
+ PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
}
static inline void apic_write_atomic(unsigned long reg, unsigned long v)
{
- PVOP_VCALL2(apic_write_atomic, reg, v);
+ PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
}
static inline unsigned long apic_read(unsigned long reg)
{
- return PVOP_CALL1(unsigned long, apic_read, reg);
+ return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
}
static inline void setup_boot_clock(void)
{
- PVOP_VCALL0(setup_boot_clock);
+ PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
}
static inline void setup_secondary_clock(void)
{
- PVOP_VCALL0(setup_secondary_clock);
+ PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
}
#endif
static inline void paravirt_post_allocator_init(void)
{
- if (paravirt_ops.post_allocator_init)
- (*paravirt_ops.post_allocator_init)();
+ if (pv_init_ops.post_allocator_init)
+ (*pv_init_ops.post_allocator_init)();
}
static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
- if (paravirt_ops.pagetable_setup_start)
- (*paravirt_ops.pagetable_setup_start)(base);
+ (*pv_mmu_ops.pagetable_setup_start)(base);
}
static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
- if (paravirt_ops.pagetable_setup_done)
- (*paravirt_ops.pagetable_setup_done)(base);
+ (*pv_mmu_ops.pagetable_setup_done)(base);
}
#ifdef CONFIG_SMP
static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
unsigned long start_esp)
{
- PVOP_VCALL3(startup_ipi_hook, phys_apicid, start_eip, start_esp);
+ PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
+ phys_apicid, start_eip, start_esp);
}
#endif
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
- PVOP_VCALL2(activate_mm, prev, next);
+ PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
}
static inline void arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm)
{
- PVOP_VCALL2(dup_mmap, oldmm, mm);
+ PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
}
static inline void arch_exit_mmap(struct mm_struct *mm)
{
- PVOP_VCALL1(exit_mmap, mm);
+ PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
}
static inline void __flush_tlb(void)
{
- PVOP_VCALL0(flush_tlb_user);
+ PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
}
static inline void __flush_tlb_global(void)
{
- PVOP_VCALL0(flush_tlb_kernel);
+ PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
}
static inline void __flush_tlb_single(unsigned long addr)
{
- PVOP_VCALL1(flush_tlb_single, addr);
+ PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
}
static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
unsigned long va)
{
- PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va);
+ PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
}
static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
{
- PVOP_VCALL2(alloc_pt, mm, pfn);
+ PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn);
}
static inline void paravirt_release_pt(unsigned pfn)
{
- PVOP_VCALL1(release_pt, pfn);
+ PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
}
static inline void paravirt_alloc_pd(unsigned pfn)
{
- PVOP_VCALL1(alloc_pd, pfn);
+ PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn);
}
static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
unsigned start, unsigned count)
{
- PVOP_VCALL4(alloc_pd_clone, pfn, clonepfn, start, count);
+ PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count);
}
static inline void paravirt_release_pd(unsigned pfn)
{
- PVOP_VCALL1(release_pd, pfn);
+ PVOP_VCALL1(pv_mmu_ops.release_pd, pfn);
}
#ifdef CONFIG_HIGHPTE
static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
{
unsigned long ret;
- ret = PVOP_CALL2(unsigned long, kmap_atomic_pte, page, type);
+ ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
return (void *)ret;
}
#endif
@@ -777,162 +813,191 @@ static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
static inline void pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- PVOP_VCALL3(pte_update, mm, addr, ptep);
+ PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}
static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- PVOP_VCALL3(pte_update_defer, mm, addr, ptep);
+ PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}
#ifdef CONFIG_X86_PAE
static inline pte_t __pte(unsigned long long val)
{
- unsigned long long ret = PVOP_CALL2(unsigned long long, make_pte,
+ unsigned long long ret = PVOP_CALL2(unsigned long long,
+ pv_mmu_ops.make_pte,
val, val >> 32);
return (pte_t) { ret, ret >> 32 };
}
static inline pmd_t __pmd(unsigned long long val)
{
- return (pmd_t) { PVOP_CALL2(unsigned long long, make_pmd, val, val >> 32) };
+ return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd,
+ val, val >> 32) };
}
static inline pgd_t __pgd(unsigned long long val)
{
- return (pgd_t) { PVOP_CALL2(unsigned long long, make_pgd, val, val >> 32) };
+ return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd,
+ val, val >> 32) };
}
static inline unsigned long long pte_val(pte_t x)
{
- return PVOP_CALL2(unsigned long long, pte_val, x.pte_low, x.pte_high);
+ return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val,
+ x.pte_low, x.pte_high);
}
static inline unsigned long long pmd_val(pmd_t x)
{
- return PVOP_CALL2(unsigned long long, pmd_val, x.pmd, x.pmd >> 32);
+ return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val,
+ x.pmd, x.pmd >> 32);
}
static inline unsigned long long pgd_val(pgd_t x)
{
- return PVOP_CALL2(unsigned long long, pgd_val, x.pgd, x.pgd >> 32);
+ return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val,
+ x.pgd, x.pgd >> 32);
}
static inline void set_pte(pte_t *ptep, pte_t pteval)
{
- PVOP_VCALL3(set_pte, ptep, pteval.pte_low, pteval.pte_high);
+ PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high);
}
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
/* 5 arg words */
- paravirt_ops.set_pte_at(mm, addr, ptep, pteval);
+ pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval);
}
static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
{
- PVOP_VCALL3(set_pte_atomic, ptep, pteval.pte_low, pteval.pte_high);
+ PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
+ pteval.pte_low, pteval.pte_high);
}
static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
/* 5 arg words */
- paravirt_ops.set_pte_present(mm, addr, ptep, pte);
+ pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
}
static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
- PVOP_VCALL3(set_pmd, pmdp, pmdval.pmd, pmdval.pmd >> 32);
+ PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp,
+ pmdval.pmd, pmdval.pmd >> 32);
}
static inline void set_pud(pud_t *pudp, pud_t pudval)
{
- PVOP_VCALL3(set_pud, pudp, pudval.pgd.pgd, pudval.pgd.pgd >> 32);
+ PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
+ pudval.pgd.pgd, pudval.pgd.pgd >> 32);
}
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- PVOP_VCALL3(pte_clear, mm, addr, ptep);
+ PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
}
static inline void pmd_clear(pmd_t *pmdp)
{
- PVOP_VCALL1(pmd_clear, pmdp);
+ PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
}
#else /* !CONFIG_X86_PAE */
static inline pte_t __pte(unsigned long val)
{
- return (pte_t) { PVOP_CALL1(unsigned long, make_pte, val) };
+ return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) };
}
static inline pgd_t __pgd(unsigned long val)
{
- return (pgd_t) { PVOP_CALL1(unsigned long, make_pgd, val) };
+ return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) };
}
static inline unsigned long pte_val(pte_t x)
{
- return PVOP_CALL1(unsigned long, pte_val, x.pte_low);
+ return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low);
}
static inline unsigned long pgd_val(pgd_t x)
{
- return PVOP_CALL1(unsigned long, pgd_val, x.pgd);
+ return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd);
}
static inline void set_pte(pte_t *ptep, pte_t pteval)
{
- PVOP_VCALL2(set_pte, ptep, pteval.pte_low);
+ PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low);
}
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
- PVOP_VCALL4(set_pte_at, mm, addr, ptep, pteval.pte_low);
+ PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low);
}
static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
- PVOP_VCALL2(set_pmd, pmdp, pmdval.pud.pgd.pgd);
+ PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd);
}
#endif /* CONFIG_X86_PAE */
+/* Lazy mode for batching updates / context switch */
+enum paravirt_lazy_mode {
+ PARAVIRT_LAZY_NONE,
+ PARAVIRT_LAZY_MMU,
+ PARAVIRT_LAZY_CPU,
+};
+
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
+void paravirt_enter_lazy_cpu(void);
+void paravirt_leave_lazy_cpu(void);
+void paravirt_enter_lazy_mmu(void);
+void paravirt_leave_lazy_mmu(void);
+void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
+
#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
static inline void arch_enter_lazy_cpu_mode(void)
{
- PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_CPU);
+ PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
}
static inline void arch_leave_lazy_cpu_mode(void)
{
- PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_NONE);
+ PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
}
static inline void arch_flush_lazy_cpu_mode(void)
{
- PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_FLUSH);
+ if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
+ arch_leave_lazy_cpu_mode();
+ arch_enter_lazy_cpu_mode();
+ }
}
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
- PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_MMU);
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
}
static inline void arch_leave_lazy_mmu_mode(void)
{
- PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_NONE);
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
}
static inline void arch_flush_lazy_mmu_mode(void)
{
- PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_FLUSH);
+ if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
}
void _paravirt_nop(void);
@@ -957,7 +1022,7 @@ static inline unsigned long __raw_local_save_flags(void)
PARAVIRT_CALL
"popl %%edx; popl %%ecx")
: "=a"(f)
- : paravirt_type(save_fl),
+ : paravirt_type(pv_irq_ops.save_fl),
paravirt_clobber(CLBR_EAX)
: "memory", "cc");
return f;
@@ -970,7 +1035,7 @@ static inline void raw_local_irq_restore(unsigned long f)
"popl %%edx; popl %%ecx")
: "=a"(f)
: "0"(f),
- paravirt_type(restore_fl),
+ paravirt_type(pv_irq_ops.restore_fl),
paravirt_clobber(CLBR_EAX)
: "memory", "cc");
}
@@ -981,7 +1046,7 @@ static inline void raw_local_irq_disable(void)
PARAVIRT_CALL
"popl %%edx; popl %%ecx")
:
- : paravirt_type(irq_disable),
+ : paravirt_type(pv_irq_ops.irq_disable),
paravirt_clobber(CLBR_EAX)
: "memory", "eax", "cc");
}
@@ -992,7 +1057,7 @@ static inline void raw_local_irq_enable(void)
PARAVIRT_CALL
"popl %%edx; popl %%ecx")
:
- : paravirt_type(irq_enable),
+ : paravirt_type(pv_irq_ops.irq_enable),
paravirt_clobber(CLBR_EAX)
: "memory", "eax", "cc");
}
@@ -1008,21 +1073,23 @@ static inline unsigned long __raw_local_irq_save(void)
#define CLI_STRING \
_paravirt_alt("pushl %%ecx; pushl %%edx;" \
- "call *paravirt_ops+%c[paravirt_cli_type]*4;" \
+ "call *%[paravirt_cli_opptr];" \
"popl %%edx; popl %%ecx", \
"%c[paravirt_cli_type]", "%c[paravirt_clobber]")
#define STI_STRING \
_paravirt_alt("pushl %%ecx; pushl %%edx;" \
- "call *paravirt_ops+%c[paravirt_sti_type]*4;" \
+ "call *%[paravirt_sti_opptr];" \
"popl %%edx; popl %%ecx", \
"%c[paravirt_sti_type]", "%c[paravirt_clobber]")
#define CLI_STI_CLOBBERS , "%eax"
#define CLI_STI_INPUT_ARGS \
, \
- [paravirt_cli_type] "i" (PARAVIRT_PATCH(irq_disable)), \
- [paravirt_sti_type] "i" (PARAVIRT_PATCH(irq_enable)), \
+ [paravirt_cli_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_disable)), \
+ [paravirt_cli_opptr] "m" (pv_irq_ops.irq_disable), \
+ [paravirt_sti_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_enable)), \
+ [paravirt_sti_opptr] "m" (pv_irq_ops.irq_enable), \
paravirt_clobber(CLBR_EAX)
/* Make sure as little as possible of this mess escapes. */
@@ -1042,7 +1109,7 @@ static inline unsigned long __raw_local_irq_save(void)
#else /* __ASSEMBLY__ */
-#define PARA_PATCH(off) ((off) / 4)
+#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
#define PARA_SITE(ptype, clobbers, ops) \
771:; \
@@ -1055,29 +1122,29 @@ static inline unsigned long __raw_local_irq_save(void)
.short clobbers; \
.popsection
-#define INTERRUPT_RETURN \
- PARA_SITE(PARA_PATCH(PARAVIRT_iret), CLBR_NONE, \
- jmp *%cs:paravirt_ops+PARAVIRT_iret)
+#define INTERRUPT_RETURN \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
+ jmp *%cs:pv_cpu_ops+PV_CPU_iret)
#define DISABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PARAVIRT_irq_disable), clobbers, \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
pushl %eax; pushl %ecx; pushl %edx; \
- call *%cs:paravirt_ops+PARAVIRT_irq_disable; \
+ call *%cs:pv_irq_ops+PV_IRQ_irq_disable; \
popl %edx; popl %ecx; popl %eax) \
#define ENABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable), clobbers, \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
pushl %eax; pushl %ecx; pushl %edx; \
- call *%cs:paravirt_ops+PARAVIRT_irq_enable; \
+ call *%cs:pv_irq_ops+PV_IRQ_irq_enable; \
popl %edx; popl %ecx; popl %eax)
-#define ENABLE_INTERRUPTS_SYSEXIT \
- PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable_sysexit), CLBR_NONE, \
- jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit)
+#define ENABLE_INTERRUPTS_SYSEXIT \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), CLBR_NONE,\
+ jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_sysexit)
#define GET_CR0_INTO_EAX \
push %ecx; push %edx; \
- call *paravirt_ops+PARAVIRT_read_cr0; \
+ call *pv_cpu_ops+PV_CPU_read_cr0; \
pop %edx; pop %ecx
#endif /* __ASSEMBLY__ */
diff --git a/include/asm-x86/pgtable-3level-defs.h b/include/asm-x86/pgtable-3level-defs.h
index c0df89f66e8b..448ac9516314 100644
--- a/include/asm-x86/pgtable-3level-defs.h
+++ b/include/asm-x86/pgtable-3level-defs.h
@@ -2,7 +2,7 @@
#define _I386_PGTABLE_3LEVEL_DEFS_H
#ifdef CONFIG_PARAVIRT
-#define SHARED_KERNEL_PMD (paravirt_ops.shared_kernel_pmd)
+#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
#else
#define SHARED_KERNEL_PMD 1
#endif
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index ff61ea365997..b05d8a6d9143 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -160,8 +160,9 @@ struct vcpu_set_singleshot_timer {
*/
#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */
struct vcpu_register_vcpu_info {
- uint32_t mfn; /* mfn of page to place vcpu_info */
- uint32_t offset; /* offset within page */
+ uint64_t mfn; /* mfn of page to place vcpu_info */
+ uint32_t offset; /* offset within page */
+ uint32_t rsvd; /* unused */
};
#endif /* __XEN_PUBLIC_VCPU_H__ */
diff --git a/mm/Kconfig b/mm/Kconfig
index 1cc6cada2bbf..b1f03b0eb7f1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -155,7 +155,6 @@ config SPLIT_PTLOCK_CPUS
int
default "4096" if ARM && !CPU_CACHE_VIPT
default "4096" if PARISC && !PA20
- default "4096" if XEN
default "4"
#