From f6e9456c9272bb570df6e217cdbe007e270b1c4e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:58 +0200 Subject: x86, cleanup: Remove obsolete boot_cpu_id variable boot_cpu_id is there for historical reasons and was renamed to boot_cpu_physical_apicid in patch: c70dcb7 x86: change boot_cpu_id to boot_cpu_physical_apicid However, there are some remaining occurrences of boot_cpu_id that are never touched in the kernel and thus its value is always 0. This patch removes boot_cpu_id completely. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-8-git-send-email-robert.richter@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/apb_timer.h | 1 - arch/x86/include/asm/cpu.h | 1 - arch/x86/kernel/apb_timer.c | 2 +- arch/x86/kernel/apic/io_apic.c | 12 ++++++------ arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/setup.c | 1 - arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/mm/k8topology_64.c | 6 +++--- 11 files changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index a69b1ac9eaf8..2fefa501d3ba 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event; extern unsigned long apbt_quick_calibrate(void); extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); -extern unsigned int boot_cpu_id; extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index b185091bf19c..4fab24de26b1 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int); DECLARE_PER_CPU(int, cpu_state); -extern unsigned int boot_cpu_id; #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 8dd77800ff5d..08f75fb4f509 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -343,7 +343,7 @@ void apbt_setup_secondary_clock(void) /* Don't register boot CPU clockevent */ cpu = smp_processor_id(); - if (cpu == boot_cpu_id) + if (!cpu) return; /* * We need to calculate the scaled math multiplication factor for diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4dc0084ec1b1..8884928d7bc1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -162,7 +162,7 @@ int __init arch_early_irq_init(void) cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); - node= cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); for (i = 0; i < count; i++) { desc = irq_to_desc(i); @@ -1483,7 +1483,7 @@ static void __init setup_IO_APIC_irqs(void) int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1548,7 +1548,7 @@ static void __init setup_IO_APIC_irqs(void) void setup_IO_APIC_irq_extra(u32 gsi) { int apic_id = 0, pin, idx, irq; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); struct irq_desc *desc; struct irq_cfg *cfg; @@ -2925,7 +2925,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int node = 
cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -3279,7 +3279,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) int create_irq(void) { - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); unsigned int irq_want; int irq; @@ -3901,7 +3901,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, if (dev) node = dev_to_node(dev); else - node = cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); desc = irq_to_desc_alloc_node(irq, node); if (!desc) { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 60a57b13082d..3a7c852f021d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ - if (c->cpu_index == boot_cpu_id) + if (!c->cpu_index) return; /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490dac63c2d2..787b3c7c6625 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_early_init(c); #ifdef CONFIG_SMP - c->cpu_index = boot_cpu_id; + c->cpu_index = 0; #endif filter_cpuid_features(c, false); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 85f69cdeae10..3a683ea5267e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -169,7 +169,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ - if (c->cpu_index == boot_cpu_id) + if (!c->cpu_index) return; /* diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e3af342fe83a..7a4cf14223ba 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -84,7 +84,7 @@ static int __init reboot_setup(char *str) } /* we will leave sorting out the final value when we are ready to reboot, since we might not - have set up boot_cpu_id or smp_num_cpu */ + have detected BSP APIC ID or smp_num_cpu */ break; #endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b008e7883207..dede5c4bae4b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -125,7 +125,6 @@ unsigned long max_pfn_mapped; RESERVE_BRK(dmi_alloc, 65536); #endif -unsigned int boot_cpu_id __read_mostly; static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a60df9ae6454..2335c15c93a4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -253,7 +253,7 @@ void __init setup_per_cpu_areas(void) * Up to this point, the boot CPU has been using .init.data * area. Reload any changed state for the boot CPU. 
*/ - if (cpu == boot_cpu_id) + if (!cpu) switch_to_new_gdt(cpu); } diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 970ed579d4e4..240f86462a83 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -54,8 +54,8 @@ static __init int find_northbridge(void) static __init void early_get_boot_cpu_id(void) { /* - * need to get boot_cpu_id so can use that to create apicid_to_node - * in k8_scan_nodes() + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in k8_scan_nodes() */ #ifdef CONFIG_X86_MPPARSE /* @@ -212,7 +212,7 @@ int __init k8_scan_nodes(void) bits = boot_cpu_data.x86_coreid_bits; cores = (1< 0) { pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); -- cgit v1.2.3 From 1f999ab5a1360afc388868cc0ef9afe8edeef3be Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:57 +0200 Subject: x86, xsave: Disable xsave in i387 emulation mode xsave is broken for (!HAVE_HWFP). This is the case if config MATH_EMULATION is enabled, 'no387' kernel parameter is set and xsave exists. xsave will not work because x86/math-emu and xsave share the same memory. As this case can be treated as corner case we simply disable xsave then. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-7-git-send-email-robert.richter@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 1f11f5ce668f..2605c50b11d3 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -67,6 +67,12 @@ static void __cpuinit init_thread_xstate(void) */ if (!HAVE_HWFP) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct i387_soft_struct); return; } -- cgit v1.2.3 From 61c77326d1df079f202fa79403c3ccd8c5966a81 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 16 Aug 2010 09:16:55 +0800 Subject: x86, mm: Avoid unnecessary TLB flush In x86, access and dirty bits are set automatically by CPU when CPU accesses memory. When we go into the code path of below flush_tlb_fix_spurious_fault(), we already set dirty bit for pte and don't need flush tlb. This might mean tlb entry in some CPUs hasn't dirty bit set, but this doesn't matter. When the CPUs do page write, they will automatically check the bit and no software involved. On the other hand, flush tlb in below position is harmful. Test creates CPU number of threads, each thread writes to a same but random address in same vma range and we measure the total time. Under a 4 socket system, original time is 1.96s, while with the patch, the time is 0.8s. Under a 2 socket system, there is 20% time cut too. perf shows a lot of time are taking to send ipi/handle ipi for tlb flush. Signed-off-by: Shaohua Li LKML-Reference: <20100816011655.GA362@sli10-desk.sh.intel.com> Acked-by: Suresh Siddha Cc: Andrea Archangeli Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a34c785c5a63..2d0a33bd2971 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -603,6 +603,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep); } +#define flush_tlb_fix_spurious_fault(vma, address) + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * -- cgit v1.2.3 From fdf4289679fd41d76553ce224750e9737cd80eea Mon Sep 17 00:00:00 2001 From: "Ma, Ling" Date: Mon, 23 Aug 2010 14:11:12 -0700 Subject: x86, mem: Don't implement forward memmove() as memcpy() memmove() allow source and destination address to be overlap, but there is no such limitation for memcpy(). Therefore, explicitly implement memmove() in both the forwards and backward directions, to give us the ability to optimize memcpy(). Signed-off-by: Ma Ling LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_32.c | 38 +++++++++++++++++++++++++++----------- arch/x86/lib/memmove_64.c | 46 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 68 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 5415a9d06f53..be424dfcf365 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -25,19 +25,35 @@ void *memmove(void *dest, const void *src, size_t n) int d0, d1, d2; if (dest < src) { - memcpy(dest, src, n); + if ((dest + n) < src) + return memcpy(dest, src, n); + else + __asm__ __volatile__( + "rep\n\t" + "movsb\n\t" + : "=&c" (d0), "=&S" (d1), "=&D" (d2) + :"0" (n), + "1" (src), + "2" (dest) + :"memory"); + } else { - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+src), - "2" (n-1+dest) - :"memory"); + + if((src + count) < dest) + return memcpy(dest, src, count); + else + __asm__ __volatile__( + "std\n\t" + "rep\n\t" + "movsb\n\t" + "cld" + : "=&c" (d0), "=&S" (d1), "=&D" (d2) + :"0" (n), + "1" (n-1+src), + "2" (n-1+dest) + :"memory"); } + return dest; } EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c index 0a33909bf122..ecacc4b3d9e5 100644 --- a/arch/x86/lib/memmove_64.c +++ b/arch/x86/lib/memmove_64.c @@ -8,13 +8,49 @@ #undef memmove void *memmove(void *dest, const void *src, size_t count) { + unsigned long d0, d1, d2, d3; if (dest < src) { - return memcpy(dest, src, count); + if ((dest + count) < src) + return memcpy(dest, src, count); + else + __asm__ __volatile__( + "movq %0, %3\n\t" + "shr $3, %0\n\t" + "andq $7, %3\n\t" + "rep\n\t" + "movsq\n\t" + "movq %3, %0\n\t" + "rep\n\t" + "movsb" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3) + :"0" (count), + "1" (src), + "2" (dest) + :"memory"); } else { - char *p = dest + count; - const char *s = src + count; - while (count--) - *--p = *--s; + if((src + count) < dest) + return memcpy(dest, src, count); + else + __asm__ __volatile__( + "movq %0, %3\n\t" + "lea -8(%1, %0), %1\n\t" + "lea -8(%2, %0), %2\n\t" + "shr $3, %0\n\t" + "andq $7, %3\n\t" + "std\n\t" + "rep\n\t" + "movsq\n\t" + "lea 7(%1), %1\n\t" + "lea 7(%2), %2\n\t" + "movq %3, %0\n\t" + "rep\n\t" + "movsb\n\t" + "cld" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3) + :"0" (count), + "1" (src), + "2" (dest) + :"memory"); } return dest; } -- cgit v1.2.3 From 59daa706fbec745684702741b9f5373142dd9fdc Mon Sep 17 00:00:00 2001 
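For reference, the overlap rule that the new memmove() implements can be written in portable C. The sketch below is illustrative only: the helper name memmove_sketch and the byte-wise loops are stand-ins for the patch's rep movsb/movsq assembly. The idea is to copy forward when the destination lies below the source, backward when it lies above, and to hand the buffers to memcpy() only when they cannot overlap.

#include <stddef.h>
#include <string.h>

/*
 * Illustrative sketch of the dispatch the patch performs in assembly:
 * forward copy for dest < src, backward copy for dest > src, and a
 * memcpy() fast path only when the ranges are provably disjoint.
 */
static void *memmove_sketch(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;
	size_t i;

	if (d < s) {
		if (d + n <= s)			/* no overlap: copy order is irrelevant */
			return memcpy(dest, src, n);
		for (i = 0; i < n; i++)		/* dest overlaps head of src: copy forward */
			d[i] = s[i];
	} else if (d > s) {
		if (s + n <= d)			/* no overlap: copy order is irrelevant */
			return memcpy(dest, src, n);
		for (i = n; i > 0; i--)		/* dest overlaps tail of src: copy backward */
			d[i - 1] = s[i - 1];
	}
	return dest;
}

The two disjoint-range tests correspond to the "(dest + n) < src" and "(src + n) < dest" checks added by the patch; they matter because an optimized memcpy() is free to copy in any order, so only non-overlapping buffers may be delegated to it.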
From: Ma Ling Date: Tue, 29 Jun 2010 03:24:25 +0800 Subject: x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_32.c | 6 +- arch/x86/lib/memcpy_64.S | 158 ++++++++++++++++++++++++++++++----------------- 2 files changed, 105 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index be424dfcf365..81130d477ee2 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -36,11 +36,9 @@ void *memmove(void *dest, const void *src, size_t n) "1" (src), "2" (dest) :"memory"); - } else { - - if((src + count) < dest) - return memcpy(dest, src, count); + if((src + n) < dest) + return memcpy(dest, src, n); else __asm__ __volatile__( "std\n\t" diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index bcbcd1e0f7d5..75ef61e35e38 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -40,84 +40,132 @@ ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC + movq %rdi, %rax /* - * Put the number of full 64-byte blocks into %ecx. - * Tail portion is handled at the end: + * Use 32bit CMP here to avoid long NOP padding. */ - movq %rdi, %rax - movl %edx, %ecx - shrl $6, %ecx - jz .Lhandle_tail + cmp $0x20, %edx + jb .Lhandle_tail - .p2align 4 -.Lloop_64: /* - * We decrement the loop index here - and the zero-flag is - * checked at the end of the loop (instructions inbetween do - * not change the zero flag): + * We check whether memory false dependece could occur, + * then jump to corresponding copy mode. 
*/ - decl %ecx + cmp %dil, %sil + jl .Lcopy_backward + subl $0x20, %edx +.Lcopy_forward_loop: + subq $0x20, %rdx /* - * Move in blocks of 4x16 bytes: + * Move in blocks of 4x8 bytes: */ - movq 0*8(%rsi), %r11 - movq 1*8(%rsi), %r8 - movq %r11, 0*8(%rdi) - movq %r8, 1*8(%rdi) - - movq 2*8(%rsi), %r9 - movq 3*8(%rsi), %r10 - movq %r9, 2*8(%rdi) - movq %r10, 3*8(%rdi) - - movq 4*8(%rsi), %r11 - movq 5*8(%rsi), %r8 - movq %r11, 4*8(%rdi) - movq %r8, 5*8(%rdi) - - movq 6*8(%rsi), %r9 - movq 7*8(%rsi), %r10 - movq %r9, 6*8(%rdi) - movq %r10, 7*8(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jnz .Lloop_64 + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq 2*8(%rsi), %r10 + movq 3*8(%rsi), %r11 + leaq 4*8(%rsi), %rsi + + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae .Lcopy_forward_loop + addq $0x20, %rdx + jmp .Lhandle_tail + +.Lcopy_backward: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * At most 3 ALU operations in one cycle, + * so append NOPS in the same 16bytes trunk. + */ + .p2align 4 +.Lcopy_backward_loop: + subq $0x20, %rdx + movq -1*8(%rsi), %r8 + movq -2*8(%rsi), %r9 + movq -3*8(%rsi), %r10 + movq -4*8(%rsi), %r11 + leaq -4*8(%rsi), %rsi + movq %r8, -1*8(%rdi) + movq %r9, -2*8(%rdi) + movq %r10, -3*8(%rdi) + movq %r11, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae .Lcopy_backward_loop + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi .Lhandle_tail: - movl %edx, %ecx - andl $63, %ecx - shrl $3, %ecx - jz .Lhandle_7 + cmpq $16, %rdx + jb .Lless_16bytes + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq -2*8(%rsi, %rdx), %r10 + movq -1*8(%rsi, %rdx), %r11 + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, -2*8(%rdi, %rdx) + movq %r11, -1*8(%rdi, %rdx) + retq .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi), %r8 - movq %r8, (%rdi) - leaq 8(%rdi), %rdi - leaq 8(%rsi), %rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx, %ecx - andl $7, %ecx - jz .Lend +.Lless_16bytes: + cmpq $8, %rdx + jb .Lless_8bytes + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r8 + movq -1*8(%rsi, %rdx), %r9 + movq %r8, 0*8(%rdi) + movq %r9, -1*8(%rdi, %rdx) + retq + .p2align 4 +.Lless_8bytes: + cmpq $4, %rdx + jb .Lless_3bytes + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %ecx + movl -4(%rsi, %rdx), %r8d + movl %ecx, (%rdi) + movl %r8d, -4(%rdi, %rdx) + retq .p2align 4 +.Lless_3bytes: + cmpl $0, %edx + je .Lend + /* + * Move data from 1 bytes to 3 bytes. + */ .Lloop_1: movb (%rsi), %r8b movb %r8b, (%rdi) incq %rdi incq %rsi - decl %ecx + decl %edx jnz .Lloop_1 .Lend: - ret + retq CFI_ENDPROC ENDPROC(memcpy) ENDPROC(__memcpy) -- cgit v1.2.3 From 9863c90f682fba34cdc26c3437e8c00da6c83fa4 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 23 Aug 2010 14:49:11 -0700 Subject: x86, vmware: Remove deprecated VMI kernel support With the recent innovations in CPU hardware acceleration technologies from Intel and AMD, VMware ran a few experiments to compare these techniques to guest paravirtualization technique on VMware's platform. These hardware assisted virtualization techniques have outperformed the performance benefits provided by VMI in most of the workloads. VMware expects that these hardware features will be ubiquitous in a couple of years, as a result, VMware has started a phased retirement of this feature from the hypervisor. 
Please note that VMI has always been an optimization and non-VMI kernels still work fine on VMware's platform. Latest versions of VMware's product which support VMI are, Workstation 7.0 and VSphere 4.0 on ESX side, future maintainence releases for these products will continue supporting VMI. For more details about VMI retirement take a look at this, http://blogs.vmware.com/guestosguide/2009/09/vmi-retirement.html This feature removal was scheduled for 2.6.37 back in September 2009. Signed-off-by: Alok N Kataria LKML-Reference: <1282600151.19396.22.camel@ank32.eng.vmware.com> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 19 - arch/x86/include/asm/vmi.h | 269 ------------ arch/x86/include/asm/vmi_time.h | 98 ----- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/setup.c | 12 +- arch/x86/kernel/smpboot.c | 2 - arch/x86/kernel/vmi_32.c | 893 ---------------------------------------- arch/x86/kernel/vmiclock_32.c | 317 -------------- 8 files changed, 4 insertions(+), 1607 deletions(-) delete mode 100644 arch/x86/include/asm/vmi.h delete mode 100644 arch/x86/include/asm/vmi_time.h delete mode 100644 arch/x86/kernel/vmi_32.c delete mode 100644 arch/x86/kernel/vmiclock_32.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..f0ee331feeab 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -517,25 +517,6 @@ if PARAVIRT_GUEST source "arch/x86/xen/Kconfig" -config VMI - bool "VMI Guest support (DEPRECATED)" - select PARAVIRT - depends on X86_32 - ---help--- - VMI provides a paravirtualized interface to the VMware ESX server - (it could be used by other hypervisors in theory too, but is not - at the moment), by linking the kernel to a GPL-ed ROM module - provided by the hypervisor. - - As of September 2009, VMware has started a phased retirement - of this feature from VMware's products. Please see - feature-removal-schedule.txt for details. If you are - planning to enable this option, please note that you cannot - live migrate a VMI enabled VM to a future VMware product, - which doesn't support VMI. So if you expect your kernel to - seamlessly migrate to newer VMware products, keep this - disabled. - config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h deleted file mode 100644 index 61e08c0a2907..000000000000 --- a/arch/x86/include/asm/vmi.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * VMI interface definition - * - * Copyright (C) 2005, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Maintained by: Zachary Amsden zach@vmware.com - * - */ -#include - -/* - *--------------------------------------------------------------------- - * - * VMI Option ROM API - * - *--------------------------------------------------------------------- - */ -#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */ - -#define PCI_VENDOR_ID_VMWARE 0x15AD -#define PCI_DEVICE_ID_VMWARE_VMI 0x0801 - -/* - * We use two version numbers for compatibility, with the major - * number signifying interface breakages, and the minor number - * interface extensions. - */ -#define VMI_API_REV_MAJOR 3 -#define VMI_API_REV_MINOR 0 - -#define VMI_CALL_CPUID 0 -#define VMI_CALL_WRMSR 1 -#define VMI_CALL_RDMSR 2 -#define VMI_CALL_SetGDT 3 -#define VMI_CALL_SetLDT 4 -#define VMI_CALL_SetIDT 5 -#define VMI_CALL_SetTR 6 -#define VMI_CALL_GetGDT 7 -#define VMI_CALL_GetLDT 8 -#define VMI_CALL_GetIDT 9 -#define VMI_CALL_GetTR 10 -#define VMI_CALL_WriteGDTEntry 11 -#define VMI_CALL_WriteLDTEntry 12 -#define VMI_CALL_WriteIDTEntry 13 -#define VMI_CALL_UpdateKernelStack 14 -#define VMI_CALL_SetCR0 15 -#define VMI_CALL_SetCR2 16 -#define VMI_CALL_SetCR3 17 -#define VMI_CALL_SetCR4 18 -#define VMI_CALL_GetCR0 19 -#define VMI_CALL_GetCR2 20 -#define VMI_CALL_GetCR3 21 -#define VMI_CALL_GetCR4 22 -#define VMI_CALL_WBINVD 23 -#define VMI_CALL_SetDR 24 -#define VMI_CALL_GetDR 25 -#define VMI_CALL_RDPMC 26 -#define VMI_CALL_RDTSC 27 -#define VMI_CALL_CLTS 28 -#define VMI_CALL_EnableInterrupts 29 -#define VMI_CALL_DisableInterrupts 30 -#define VMI_CALL_GetInterruptMask 31 -#define VMI_CALL_SetInterruptMask 32 -#define VMI_CALL_IRET 33 -#define VMI_CALL_SYSEXIT 34 -#define VMI_CALL_Halt 35 -#define VMI_CALL_Reboot 36 -#define VMI_CALL_Shutdown 37 -#define VMI_CALL_SetPxE 38 -#define VMI_CALL_SetPxELong 39 -#define VMI_CALL_UpdatePxE 40 -#define VMI_CALL_UpdatePxELong 41 -#define VMI_CALL_MachineToPhysical 42 -#define VMI_CALL_PhysicalToMachine 43 -#define VMI_CALL_AllocatePage 44 -#define VMI_CALL_ReleasePage 45 -#define VMI_CALL_InvalPage 46 -#define VMI_CALL_FlushTLB 47 -#define VMI_CALL_SetLinearMapping 48 - -#define VMI_CALL_SetIOPLMask 61 -#define VMI_CALL_SetInitialAPState 62 -#define VMI_CALL_APICWrite 63 -#define VMI_CALL_APICRead 64 -#define VMI_CALL_IODelay 65 -#define VMI_CALL_SetLazyMode 73 - -/* - *--------------------------------------------------------------------- - * - * MMU operation flags - * - *--------------------------------------------------------------------- - */ - -/* Flags used by VMI_{Allocate|Release}Page call */ -#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */ -#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */ -#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */ - - -/* Flags shared by Allocate|Release Page and PTE updates */ -#define VMI_PAGE_PT 0x01 -#define VMI_PAGE_PD 0x02 -#define VMI_PAGE_PDP 0x04 -#define VMI_PAGE_PML4 0x08 - -#define VMI_PAGE_NORMAL 0x00 /* for debugging */ - -/* Flags used by PTE updates */ -#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */ -#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */ -#define VMI_PAGE_VA_MASK 0xfffff000 - -#ifdef CONFIG_X86_PAE -#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED) -#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED) -#else -#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED) -#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED) -#endif - -/* Flags used by VMI_FlushTLB call */ -#define VMI_FLUSH_TLB 0x01 -#define VMI_FLUSH_GLOBAL 0x02 - -/* - 
*--------------------------------------------------------------------- - * - * VMI relocation definitions for ROM call get_reloc - * - *--------------------------------------------------------------------- - */ - -/* VMI Relocation types */ -#define VMI_RELOCATION_NONE 0 -#define VMI_RELOCATION_CALL_REL 1 -#define VMI_RELOCATION_JUMP_REL 2 -#define VMI_RELOCATION_NOP 3 - -#ifndef __ASSEMBLY__ -struct vmi_relocation_info { - unsigned char *eip; - unsigned char type; - unsigned char reserved[3]; -}; -#endif - - -/* - *--------------------------------------------------------------------- - * - * Generic ROM structures and definitions - * - *--------------------------------------------------------------------- - */ - -#ifndef __ASSEMBLY__ - -struct vrom_header { - u16 rom_signature; /* option ROM signature */ - u8 rom_length; /* ROM length in 512 byte chunks */ - u8 rom_entry[4]; /* 16-bit code entry point */ - u8 rom_pad0; /* 4-byte align pad */ - u32 vrom_signature; /* VROM identification signature */ - u8 api_version_min;/* Minor version of API */ - u8 api_version_maj;/* Major version of API */ - u8 jump_slots; /* Number of jump slots */ - u8 reserved1; /* Reserved for expansion */ - u32 virtual_top; /* Hypervisor virtual address start */ - u16 reserved2; /* Reserved for expansion */ - u16 license_offs; /* Offset to License string */ - u16 pci_header_offs;/* Offset to PCI OPROM header */ - u16 pnp_header_offs;/* Offset to PnP OPROM header */ - u32 rom_pad3; /* PnP reserverd / VMI reserved */ - u8 reserved[96]; /* Reserved for headers */ - char vmi_init[8]; /* VMI_Init jump point */ - char get_reloc[8]; /* VMI_GetRelocationInfo jump point */ -} __attribute__((packed)); - -struct pnp_header { - char sig[4]; - char rev; - char size; - short next; - short res; - long devID; - unsigned short manufacturer_offset; - unsigned short product_offset; -} __attribute__((packed)); - -struct pci_header { - char sig[4]; - short vendorID; - short deviceID; - short vpdData; - short size; - char rev; - char class; - char subclass; - char interface; - short chunks; - char rom_version_min; - char rom_version_maj; - char codetype; - char lastRom; - short reserved; -} __attribute__((packed)); - -/* Function prototypes for bootstrapping */ -#ifdef CONFIG_VMI -extern void vmi_init(void); -extern void vmi_activate(void); -extern void vmi_bringup(void); -#else -static inline void vmi_init(void) {} -static inline void vmi_activate(void) {} -static inline void vmi_bringup(void) {} -#endif - -/* State needed to start an application processor in an SMP system. */ -struct vmi_ap_state { - u32 cr0; - u32 cr2; - u32 cr3; - u32 cr4; - - u64 efer; - - u32 eip; - u32 eflags; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esp; - u32 ebp; - u32 esi; - u32 edi; - u16 cs; - u16 ss; - u16 ds; - u16 es; - u16 fs; - u16 gs; - u16 ldtr; - - u16 gdtr_limit; - u32 gdtr_base; - u32 idtr_base; - u16 idtr_limit; -}; - -#endif diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h deleted file mode 100644 index c6e0bee93e3c..000000000000 --- a/arch/x86/include/asm/vmi_time.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * VMI Time wrappers - * - * Copyright (C) 2006, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to dhecht@vmware.com - * - */ - -#ifndef _ASM_X86_VMI_TIME_H -#define _ASM_X86_VMI_TIME_H - -/* - * Raw VMI call indices for timer functions - */ -#define VMI_CALL_GetCycleFrequency 66 -#define VMI_CALL_GetCycleCounter 67 -#define VMI_CALL_SetAlarm 68 -#define VMI_CALL_CancelAlarm 69 -#define VMI_CALL_GetWallclockTime 70 -#define VMI_CALL_WallclockUpdated 71 - -/* Cached VMI timer operations */ -extern struct vmi_timer_ops { - u64 (*get_cycle_frequency)(void); - u64 (*get_cycle_counter)(int); - u64 (*get_wallclock)(void); - int (*wallclock_updated)(void); - void (*set_alarm)(u32 flags, u64 expiry, u64 period); - void (*cancel_alarm)(u32 flags); -} vmi_timer_ops; - -/* Prototypes */ -extern void __init vmi_time_init(void); -extern unsigned long vmi_get_wallclock(void); -extern int vmi_set_wallclock(unsigned long now); -extern unsigned long long vmi_sched_clock(void); -extern unsigned long vmi_tsc_khz(void); - -#ifdef CONFIG_X86_LOCAL_APIC -extern void __devinit vmi_time_bsp_init(void); -extern void __devinit vmi_time_ap_init(void); -#endif - -/* - * When run under a hypervisor, a vcpu is always in one of three states: - * running, halted, or ready. The vcpu is in the 'running' state if it - * is executing. When the vcpu executes the halt interface, the vcpu - * enters the 'halted' state and remains halted until there is some work - * pending for the vcpu (e.g. an alarm expires, host I/O completes on - * behalf of virtual I/O). At this point, the vcpu enters the 'ready' - * state (waiting for the hypervisor to reschedule it). Finally, at any - * time when the vcpu is not in the 'running' state nor the 'halted' - * state, it is in the 'ready' state. - * - * Real time is advances while the vcpu is 'running', 'ready', or - * 'halted'. Stolen time is the time in which the vcpu is in the - * 'ready' state. Available time is the remaining time -- the vcpu is - * either 'running' or 'halted'. - * - * All three views of time are accessible through the VMI cycle - * counters. - */ - -/* The cycle counters. 
*/ -#define VMI_CYCLES_REAL 0 -#define VMI_CYCLES_AVAILABLE 1 -#define VMI_CYCLES_STOLEN 2 - -/* The alarm interface 'flags' bits */ -#define VMI_ALARM_COUNTERS 2 - -#define VMI_ALARM_COUNTER_MASK 0x000000ff - -#define VMI_ALARM_WIRED_IRQ0 0x00000000 -#define VMI_ALARM_WIRED_LVTT 0x00010000 - -#define VMI_ALARM_IS_ONESHOT 0x00000000 -#define VMI_ALARM_IS_PERIODIC 0x00000100 - -#define CONFIG_VMI_ALARM_HZ 100 - -#endif /* _ASM_X86_VMI_TIME_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266bd..801127cd9754 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -91,7 +91,6 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o -obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b996..feb4c21e499a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -83,7 +83,6 @@ #include #include #include -#include #include #include #include @@ -734,10 +733,10 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif - /* VMI may relocate the fixmap; do this before touching ioremap area */ - vmi_init(); - - /* OFW also may relocate the fixmap */ + /* + * If we have OLPC OFW, we might end up relocating the fixmap due to + * reserve_top(), so do this before touching the ioremap area. + */ olpc_ofw_detect(); early_trap_init(); @@ -838,9 +837,6 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); - /* Must be before kernel pagetables are setup */ - vmi_activate(); - /* after early param, so could get panic from serial */ reserve_early_setup_data(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8b3bfc4dd708..63a1a5596ac0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,7 +62,6 @@ #include #include #include -#include #include #include #include @@ -311,7 +310,6 @@ notrace static void __cpuinit start_secondary(void *unused) __flush_tlb_all(); #endif - vmi_bringup(); cpu_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c deleted file mode 100644 index ce9fbacb7526..000000000000 --- a/arch/x86/kernel/vmi_32.c +++ /dev/null @@ -1,893 +0,0 @@ -/* - * VMI specific paravirt-ops implementation - * - * Copyright (C) 2005, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to zach@vmware.com - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Convenient for calling VMI functions indirectly in the ROM */ -typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); -typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int); - -#define call_vrom_func(rom,func) \ - (((VROMFUNC *)(rom->func))()) - -#define call_vrom_long_func(rom,func,arg) \ - (((VROMLONGFUNC *)(rom->func)) (arg)) - -static struct vrom_header *vmi_rom; -static int disable_pge; -static int disable_pse; -static int disable_sep; -static int disable_tsc; -static int disable_mtrr; -static int disable_noidle; -static int disable_vmi_timer; - -/* Cached VMI operations */ -static struct { - void (*cpuid)(void /* non-c */); - void (*_set_ldt)(u32 selector); - void (*set_tr)(u32 selector); - void (*write_idt_entry)(struct desc_struct *, int, u32, u32); - void (*write_gdt_entry)(struct desc_struct *, int, u32, u32); - void (*write_ldt_entry)(struct desc_struct *, int, u32, u32); - void (*set_kernel_stack)(u32 selector, u32 sp0); - void (*allocate_page)(u32, u32, u32, u32, u32); - void (*release_page)(u32, u32); - void (*set_pte)(pte_t, pte_t *, unsigned); - void (*update_pte)(pte_t *, unsigned); - void (*set_linear_mapping)(int, void *, u32, u32); - void (*_flush_tlb)(int); - void (*set_initial_ap_state)(int, int); - void (*halt)(void); - void (*set_lazy_mode)(int mode); -} vmi_ops; - -/* Cached VMI operations */ -struct vmi_timer_ops vmi_timer_ops; - -/* - * VMI patching routines. - */ -#define MNEM_CALL 0xe8 -#define MNEM_JMP 0xe9 -#define MNEM_RET 0xc3 - -#define IRQ_PATCH_INT_MASK 0 -#define IRQ_PATCH_DISABLE 5 - -static inline void patch_offset(void *insnbuf, - unsigned long ip, unsigned long dest) -{ - *(unsigned long *)(insnbuf+1) = dest-ip-5; -} - -static unsigned patch_internal(int call, unsigned len, void *insnbuf, - unsigned long ip) -{ - u64 reloc; - struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; - reloc = call_vrom_long_func(vmi_rom, get_reloc, call); - switch(rel->type) { - case VMI_RELOCATION_CALL_REL: - BUG_ON(len < 5); - *(char *)insnbuf = MNEM_CALL; - patch_offset(insnbuf, ip, (unsigned long)rel->eip); - return 5; - - case VMI_RELOCATION_JUMP_REL: - BUG_ON(len < 5); - *(char *)insnbuf = MNEM_JMP; - patch_offset(insnbuf, ip, (unsigned long)rel->eip); - return 5; - - case VMI_RELOCATION_NOP: - /* obliterate the whole thing */ - return 0; - - case VMI_RELOCATION_NONE: - /* leave native code in place */ - break; - - default: - BUG(); - } - return len; -} - -/* - * Apply patch if appropriate, return length of new instruction - * sequence. The callee does nop padding for us. 
- */ -static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, - unsigned long ip, unsigned len) -{ - switch (type) { - case PARAVIRT_PATCH(pv_irq_ops.irq_disable): - return patch_internal(VMI_CALL_DisableInterrupts, len, - insns, ip); - case PARAVIRT_PATCH(pv_irq_ops.irq_enable): - return patch_internal(VMI_CALL_EnableInterrupts, len, - insns, ip); - case PARAVIRT_PATCH(pv_irq_ops.restore_fl): - return patch_internal(VMI_CALL_SetInterruptMask, len, - insns, ip); - case PARAVIRT_PATCH(pv_irq_ops.save_fl): - return patch_internal(VMI_CALL_GetInterruptMask, len, - insns, ip); - case PARAVIRT_PATCH(pv_cpu_ops.iret): - return patch_internal(VMI_CALL_IRET, len, insns, ip); - case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): - return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); - default: - break; - } - return len; -} - -/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ -static void vmi_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int override = 0; - if (*ax == 1) - override = 1; - asm volatile ("call *%6" - : "=a" (*ax), - "=b" (*bx), - "=c" (*cx), - "=d" (*dx) - : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid)); - if (override) { - if (disable_pse) - *dx &= ~X86_FEATURE_PSE; - if (disable_pge) - *dx &= ~X86_FEATURE_PGE; - if (disable_sep) - *dx &= ~X86_FEATURE_SEP; - if (disable_tsc) - *dx &= ~X86_FEATURE_TSC; - if (disable_mtrr) - *dx &= ~X86_FEATURE_MTRR; - } -} - -static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) -{ - if (gdt[nr].a != new->a || gdt[nr].b != new->b) - write_gdt_entry(gdt, nr, new, 0); -} - -static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) -{ - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]); - vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]); - vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]); -} - -static void vmi_set_ldt(const void *addr, unsigned entries) -{ - unsigned cpu = smp_processor_id(); - struct desc_struct desc; - - pack_descriptor(&desc, (unsigned long)addr, - entries * sizeof(struct desc_struct) - 1, - DESC_LDT, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT); - vmi_ops._set_ldt(entries ? 
GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); -} - -static void vmi_set_tr(void) -{ - vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); -} - -static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) -{ - u32 *idt_entry = (u32 *)g; - vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]); -} - -static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, - const void *desc, int type) -{ - u32 *gdt_entry = (u32 *)desc; - vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]); -} - -static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, - const void *desc) -{ - u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); -} - -static void vmi_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - tss->x86_tss.sp0 = thread->sp0; - - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } - vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0); -} - -static void vmi_flush_tlb_user(void) -{ - vmi_ops._flush_tlb(VMI_FLUSH_TLB); -} - -static void vmi_flush_tlb_kernel(void) -{ - vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); -} - -/* Stub to do nothing at all; used for delays and unimplemented calls */ -static void vmi_nop(void) -{ -} - -static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) -{ - vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); -} - -static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) -{ - /* - * This call comes in very early, before mem_map is setup. - * It is called only for swapper_pg_dir, which already has - * data on it. - */ - vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); -} - -static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) -{ - vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); -} - -static void vmi_release_pte(unsigned long pfn) -{ - vmi_ops.release_page(pfn, VMI_PAGE_L1); -} - -static void vmi_release_pmd(unsigned long pfn) -{ - vmi_ops.release_page(pfn, VMI_PAGE_L2); -} - -/* - * We use the pgd_free hook for releasing the pgd page: - */ -static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - unsigned long pfn = __pa(pgd) >> PAGE_SHIFT; - - vmi_ops.release_page(pfn, VMI_PAGE_L2); -} - -/* - * Helper macros for MMU update flags. We can defer updates until a flush - * or page invalidation only if the update is to the current address space - * (otherwise, there is no flush). We must check against init_mm, since - * this could be a kernel update, which usually passes init_mm, although - * sometimes this check can be skipped if we know the particular function - * is only called on user mode PTEs. We could change the kernel to pass - * current->active_mm here, but in particular, I was unsure if changing - * mm/highmem.c to do this would still be correct on other architectures. - */ -#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \ - (!mustbeuser && (mm) == &init_mm)) -#define vmi_flags_addr(mm, addr, level, user) \ - ((level) | (is_current_as(mm, user) ? \ - (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) -#define vmi_flags_addr_defer(mm, addr, level, user) \ - ((level) | (is_current_as(mm, user) ? 
\ - (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) - -static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); -} - -static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); -} - -static void vmi_set_pte(pte_t *ptep, pte_t pte) -{ - /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ - vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); -} - -static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) -{ - vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); -} - -static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ -#ifdef CONFIG_X86_PAE - const pte_t pte = { .pte = pmdval.pmd }; -#else - const pte_t pte = { pmdval.pud.pgd.pgd }; -#endif - vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); -} - -#ifdef CONFIG_X86_PAE - -static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) -{ - /* - * XXX This is called from set_pmd_pte, but at both PT - * and PD layers so the VMI_PAGE_PT flag is wrong. But - * it is only called for large page mapping changes, - * the Xen backend, doesn't support large pages, and the - * ESX backend doesn't depend on the flag. - */ - set_64bit((unsigned long long *)ptep,pte_val(pteval)); - vmi_ops.update_pte(ptep, VMI_PAGE_PT); -} - -static void vmi_set_pud(pud_t *pudp, pud_t pudval) -{ - /* Um, eww */ - const pte_t pte = { .pte = pudval.pgd.pgd }; - vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); -} - -static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - const pte_t pte = { .pte = 0 }; - vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); -} - -static void vmi_pmd_clear(pmd_t *pmd) -{ - const pte_t pte = { .pte = 0 }; - vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); -} -#endif - -#ifdef CONFIG_SMP -static void __devinit -vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, - unsigned long start_esp) -{ - struct vmi_ap_state ap; - - /* Default everything to zero. This is fine for most GPRs. */ - memset(&ap, 0, sizeof(struct vmi_ap_state)); - - ap.gdtr_limit = GDT_SIZE - 1; - ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid); - - ap.idtr_limit = IDT_ENTRIES * 8 - 1; - ap.idtr_base = (unsigned long) idt_table; - - ap.ldtr = 0; - - ap.cs = __KERNEL_CS; - ap.eip = (unsigned long) start_eip; - ap.ss = __KERNEL_DS; - ap.esp = (unsigned long) start_esp; - - ap.ds = __USER_DS; - ap.es = __USER_DS; - ap.fs = __KERNEL_PERCPU; - ap.gs = __KERNEL_STACK_CANARY; - - ap.eflags = 0; - -#ifdef CONFIG_X86_PAE - /* efer should match BSP efer. */ - if (cpu_has_nx) { - unsigned l, h; - rdmsr(MSR_EFER, l, h); - ap.efer = (unsigned long long) h << 32 | l; - } -#endif - - ap.cr3 = __pa(swapper_pg_dir); - /* Protected mode, paging, AM, WP, NE, MP. 
*/ - ap.cr0 = 0x80050023; - ap.cr4 = mmu_cr4_features; - vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid); -} -#endif - -static void vmi_start_context_switch(struct task_struct *prev) -{ - paravirt_start_context_switch(prev); - vmi_ops.set_lazy_mode(2); -} - -static void vmi_end_context_switch(struct task_struct *next) -{ - vmi_ops.set_lazy_mode(0); - paravirt_end_context_switch(next); -} - -static void vmi_enter_lazy_mmu(void) -{ - paravirt_enter_lazy_mmu(); - vmi_ops.set_lazy_mode(1); -} - -static void vmi_leave_lazy_mmu(void) -{ - vmi_ops.set_lazy_mode(0); - paravirt_leave_lazy_mmu(); -} - -static inline int __init check_vmi_rom(struct vrom_header *rom) -{ - struct pci_header *pci; - struct pnp_header *pnp; - const char *manufacturer = "UNKNOWN"; - const char *product = "UNKNOWN"; - const char *license = "unspecified"; - - if (rom->rom_signature != 0xaa55) - return 0; - if (rom->vrom_signature != VMI_SIGNATURE) - return 0; - if (rom->api_version_maj != VMI_API_REV_MAJOR || - rom->api_version_min+1 < VMI_API_REV_MINOR+1) { - printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n", - rom->api_version_maj, - rom->api_version_min); - return 0; - } - - /* - * Relying on the VMI_SIGNATURE field is not 100% safe, so check - * the PCI header and device type to make sure this is really a - * VMI device. - */ - if (!rom->pci_header_offs) { - printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n"); - return 0; - } - - pci = (struct pci_header *)((char *)rom+rom->pci_header_offs); - if (pci->vendorID != PCI_VENDOR_ID_VMWARE || - pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) { - /* Allow it to run... anyways, but warn */ - printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n"); - } - - if (rom->pnp_header_offs) { - pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs); - if (pnp->manufacturer_offset) - manufacturer = (const char *)rom+pnp->manufacturer_offset; - if (pnp->product_offset) - product = (const char *)rom+pnp->product_offset; - } - - if (rom->license_offs) - license = (char *)rom+rom->license_offs; - - printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n", - manufacturer, product, - rom->api_version_maj, rom->api_version_min, - pci->rom_version_maj, pci->rom_version_min); - - /* Don't allow BSD/MIT here for now because we don't want to end up - with any binary only shim layers */ - if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) { - printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. 
Not used.\n", - license); - return 0; - } - - return 1; -} - -/* - * Probe for the VMI option ROM - */ -static inline int __init probe_vmi_rom(void) -{ - unsigned long base; - - /* VMI ROM is in option ROM area, check signature */ - for (base = 0xC0000; base < 0xE0000; base += 2048) { - struct vrom_header *romstart; - romstart = (struct vrom_header *)isa_bus_to_virt(base); - if (check_vmi_rom(romstart)) { - vmi_rom = romstart; - return 1; - } - } - return 0; -} - -/* - * VMI setup common to all processors - */ -void vmi_bringup(void) -{ - /* We must establish the lowmem mapping for MMU ops to work */ - if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0); -} - -/* - * Return a pointer to a VMI function or NULL if unimplemented - */ -static void *vmi_get_function(int vmicall) -{ - u64 reloc; - const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; - reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall); - BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); - if (rel->type == VMI_RELOCATION_CALL_REL) - return (void *)rel->eip; - else - return NULL; -} - -/* - * Helper macro for making the VMI paravirt-ops fill code readable. - * For unimplemented operations, fall back to default, unless nop - * is returned by the ROM. - */ -#define para_fill(opname, vmicall) \ -do { \ - reloc = call_vrom_long_func(vmi_rom, get_reloc, \ - VMI_CALL_##vmicall); \ - if (rel->type == VMI_RELOCATION_CALL_REL) \ - opname = (void *)rel->eip; \ - else if (rel->type == VMI_RELOCATION_NOP) \ - opname = (void *)vmi_nop; \ - else if (rel->type != VMI_RELOCATION_NONE) \ - printk(KERN_WARNING "VMI: Unknown relocation " \ - "type %d for " #vmicall"\n",\ - rel->type); \ -} while (0) - -/* - * Helper macro for making the VMI paravirt-ops fill code readable. - * For cached operations which do not match the VMI ROM ABI and must - * go through a tranlation stub. Ignore NOPs, since it is not clear - * a NOP * VMI function corresponds to a NOP paravirt-op when the - * functions are not in 1-1 correspondence. - */ -#define para_wrap(opname, wrapper, cache, vmicall) \ -do { \ - reloc = call_vrom_long_func(vmi_rom, get_reloc, \ - VMI_CALL_##vmicall); \ - BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ - if (rel->type == VMI_RELOCATION_CALL_REL) { \ - opname = wrapper; \ - vmi_ops.cache = (void *)rel->eip; \ - } \ -} while (0) - -/* - * Activate the VMI interface and switch into paravirtualized mode - */ -static inline int __init activate_vmi(void) -{ - short kernel_cs; - u64 reloc; - const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; - - /* - * Prevent page tables from being allocated in highmem, even if - * CONFIG_HIGHPTE is enabled. - */ - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - - if (call_vrom_func(vmi_rom, vmi_init) != 0) { - printk(KERN_ERR "VMI ROM failed to initialize!"); - return 0; - } - savesegment(cs, kernel_cs); - - pv_info.paravirt_enabled = 1; - pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = "vmi [deprecated]"; - - pv_init_ops.patch = vmi_patch; - - /* - * Many of these operations are ABI compatible with VMI. - * This means we can fill in the paravirt-ops with direct - * pointers into the VMI ROM. If the calling convention for - * these operations changes, this code needs to be updated. - * - * Exceptions - * CPUID paravirt-op uses pointers, not the native ISA - * halt has no VMI equivalent; all VMI halts are "safe" - * no MSR support yet - just trap and emulate. 
VMI uses the - * same ABI as the native ISA, but Linux wants exceptions - * from bogus MSR read / write handled - * rdpmc is not yet used in Linux - */ - - /* CPUID is special, so very special it gets wrapped like a present */ - para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID); - - para_fill(pv_cpu_ops.clts, CLTS); - para_fill(pv_cpu_ops.get_debugreg, GetDR); - para_fill(pv_cpu_ops.set_debugreg, SetDR); - para_fill(pv_cpu_ops.read_cr0, GetCR0); - para_fill(pv_mmu_ops.read_cr2, GetCR2); - para_fill(pv_mmu_ops.read_cr3, GetCR3); - para_fill(pv_cpu_ops.read_cr4, GetCR4); - para_fill(pv_cpu_ops.write_cr0, SetCR0); - para_fill(pv_mmu_ops.write_cr2, SetCR2); - para_fill(pv_mmu_ops.write_cr3, SetCR3); - para_fill(pv_cpu_ops.write_cr4, SetCR4); - - para_fill(pv_irq_ops.save_fl.func, GetInterruptMask); - para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask); - para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts); - para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts); - - para_fill(pv_cpu_ops.wbinvd, WBINVD); - para_fill(pv_cpu_ops.read_tsc, RDTSC); - - /* The following we emulate with trap and emulate for now */ - /* paravirt_ops.read_msr = vmi_rdmsr */ - /* paravirt_ops.write_msr = vmi_wrmsr */ - /* paravirt_ops.rdpmc = vmi_rdpmc */ - - /* TR interface doesn't pass TR value, wrap */ - para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR); - - /* LDT is special, too */ - para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT); - - para_fill(pv_cpu_ops.load_gdt, SetGDT); - para_fill(pv_cpu_ops.load_idt, SetIDT); - para_fill(pv_cpu_ops.store_gdt, GetGDT); - para_fill(pv_cpu_ops.store_idt, GetIDT); - para_fill(pv_cpu_ops.store_tr, GetTR); - pv_cpu_ops.load_tls = vmi_load_tls; - para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry, - write_ldt_entry, WriteLDTEntry); - para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry, - write_gdt_entry, WriteGDTEntry); - para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry, - write_idt_entry, WriteIDTEntry); - para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack); - para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); - para_fill(pv_cpu_ops.io_delay, IODelay); - - para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, - set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, - set_lazy_mode, SetLazyMode); - - para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, - set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, - set_lazy_mode, SetLazyMode); - - /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); - para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); - para_fill(pv_mmu_ops.flush_tlb_single, InvalPage); - - /* - * Until a standard flag format can be agreed on, we need to - * implement these as wrappers in Linux. Get the VMI ROM - * function pointers for the two backend calls. 
- */ -#ifdef CONFIG_X86_PAE - vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong); - vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong); -#else - vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE); - vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE); -#endif - - if (vmi_ops.set_pte) { - pv_mmu_ops.set_pte = vmi_set_pte; - pv_mmu_ops.set_pte_at = vmi_set_pte_at; - pv_mmu_ops.set_pmd = vmi_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; - pv_mmu_ops.set_pud = vmi_set_pud; - pv_mmu_ops.pte_clear = vmi_pte_clear; - pv_mmu_ops.pmd_clear = vmi_pmd_clear; -#endif - } - - if (vmi_ops.update_pte) { - pv_mmu_ops.pte_update = vmi_update_pte; - pv_mmu_ops.pte_update_defer = vmi_update_pte_defer; - } - - vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); - if (vmi_ops.allocate_page) { - pv_mmu_ops.alloc_pte = vmi_allocate_pte; - pv_mmu_ops.alloc_pmd = vmi_allocate_pmd; - pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone; - } - - vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); - if (vmi_ops.release_page) { - pv_mmu_ops.release_pte = vmi_release_pte; - pv_mmu_ops.release_pmd = vmi_release_pmd; - pv_mmu_ops.pgd_free = vmi_pgd_free; - } - - /* Set linear is needed in all cases */ - vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); - - /* - * These MUST always be patched. Don't support indirect jumps - * through these operations, as the VMI interface may use either - * a jump or a call to get to these operations, depending on - * the backend. They are performance critical anyway, so requiring - * a patch is not a big problem. - */ - pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; - pv_cpu_ops.iret = (void *)0xbadbab0; - -#ifdef CONFIG_SMP - para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - para_fill(apic->read, APICRead); - para_fill(apic->write, APICWrite); -#endif - - /* - * Check for VMI timer functionality by probing for a cycle frequency method - */ - reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency); - if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) { - vmi_timer_ops.get_cycle_frequency = (void *)rel->eip; - vmi_timer_ops.get_cycle_counter = - vmi_get_function(VMI_CALL_GetCycleCounter); - vmi_timer_ops.get_wallclock = - vmi_get_function(VMI_CALL_GetWallclockTime); - vmi_timer_ops.wallclock_updated = - vmi_get_function(VMI_CALL_WallclockUpdated); - vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); - vmi_timer_ops.cancel_alarm = - vmi_get_function(VMI_CALL_CancelAlarm); - x86_init.timers.timer_init = vmi_time_init; -#ifdef CONFIG_X86_LOCAL_APIC - x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; - x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; -#endif - pv_time_ops.sched_clock = vmi_sched_clock; - x86_platform.calibrate_tsc = vmi_tsc_khz; - x86_platform.get_wallclock = vmi_get_wallclock; - x86_platform.set_wallclock = vmi_set_wallclock; - - /* We have true wallclock functions; disable CMOS clock sync */ - no_sync_cmos_clock = 1; - } else { - disable_noidle = 1; - disable_vmi_timer = 1; - } - - para_fill(pv_irq_ops.safe_halt, Halt); - - /* - * Alternative instruction rewriting doesn't happen soon enough - * to convert VMI_IRET to a call instead of a jump; so we have - * to do this before IRQs get reenabled. Fortunately, it is - * idempotent. 
- */ - apply_paravirt(__parainstructions, __parainstructions_end); - - vmi_bringup(); - - return 1; -} - -#undef para_fill - -void __init vmi_init(void) -{ - if (!vmi_rom) - probe_vmi_rom(); - else - check_vmi_rom(vmi_rom); - - /* In case probing for or validating the ROM failed, basil */ - if (!vmi_rom) - return; - - reserve_top_address(-vmi_rom->virtual_top); - -#ifdef CONFIG_X86_IO_APIC - /* This is virtual hardware; timer routing is wired correctly */ - no_timer_check = 1; -#endif -} - -void __init vmi_activate(void) -{ - unsigned long flags; - - if (!vmi_rom) - return; - - local_irq_save(flags); - activate_vmi(); - local_irq_restore(flags & X86_EFLAGS_IF); -} - -static int __init parse_vmi(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "disable_pge")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); - disable_pge = 1; - } else if (!strcmp(arg, "disable_pse")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE); - disable_pse = 1; - } else if (!strcmp(arg, "disable_sep")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); - disable_sep = 1; - } else if (!strcmp(arg, "disable_tsc")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC); - disable_tsc = 1; - } else if (!strcmp(arg, "disable_mtrr")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR); - disable_mtrr = 1; - } else if (!strcmp(arg, "disable_timer")) { - disable_vmi_timer = 1; - disable_noidle = 1; - } else if (!strcmp(arg, "disable_noidle")) - disable_noidle = 1; - return 0; -} - -early_param("vmi", parse_vmi); diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c deleted file mode 100644 index 5e1ff66ecd73..000000000000 --- a/arch/x86/kernel/vmiclock_32.c +++ /dev/null @@ -1,317 +0,0 @@ -/* - * VMI paravirtual timer support routines. - * - * Copyright (C) 2007, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) -#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) - -static DEFINE_PER_CPU(struct clock_event_device, local_events); - -static inline u32 vmi_counter(u32 flags) -{ - /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding - * cycle counter. 
*/ - return flags & VMI_ALARM_COUNTER_MASK; -} - -/* paravirt_ops.get_wallclock = vmi_get_wallclock */ -unsigned long vmi_get_wallclock(void) -{ - unsigned long long wallclock; - wallclock = vmi_timer_ops.get_wallclock(); // nsec - (void)do_div(wallclock, 1000000000); // sec - - return wallclock; -} - -/* paravirt_ops.set_wallclock = vmi_set_wallclock */ -int vmi_set_wallclock(unsigned long now) -{ - return 0; -} - -/* paravirt_ops.sched_clock = vmi_sched_clock */ -unsigned long long vmi_sched_clock(void) -{ - return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); -} - -/* x86_platform.calibrate_tsc = vmi_tsc_khz */ -unsigned long vmi_tsc_khz(void) -{ - unsigned long long khz; - khz = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(khz, 1000); - return khz; -} - -static inline unsigned int vmi_get_timer_vector(void) -{ - return IRQ0_VECTOR; -} - -/** vmi clockchip */ -#ifdef CONFIG_X86_LOCAL_APIC -static unsigned int startup_timer_irq(unsigned int irq) -{ - unsigned long val = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, vmi_get_timer_vector()); - - return (val & APIC_SEND_PENDING); -} - -static void mask_timer_irq(unsigned int irq) -{ - unsigned long val = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, val | APIC_LVT_MASKED); -} - -static void unmask_timer_irq(unsigned int irq) -{ - unsigned long val = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED); -} - -static void ack_timer_irq(unsigned int irq) -{ - ack_APIC_irq(); -} - -static struct irq_chip vmi_chip __read_mostly = { - .name = "VMI-LOCAL", - .startup = startup_timer_irq, - .mask = mask_timer_irq, - .unmask = unmask_timer_irq, - .ack = ack_timer_irq -}; -#endif - -/** vmi clockevent */ -#define VMI_ALARM_WIRED_IRQ0 0x00000000 -#define VMI_ALARM_WIRED_LVTT 0x00010000 -static int vmi_wiring = VMI_ALARM_WIRED_IRQ0; - -static inline int vmi_get_alarm_wiring(void) -{ - return vmi_wiring; -} - -static void vmi_timer_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - cycle_t now, cycles_per_hz; - BUG_ON(!irqs_disabled()); - - switch (mode) { - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - break; - case CLOCK_EVT_MODE_PERIODIC: - cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(cycles_per_hz, HZ); - now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC)); - vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - switch (evt->mode) { - case CLOCK_EVT_MODE_ONESHOT: - vmi_timer_ops.cancel_alarm(VMI_ONESHOT); - break; - case CLOCK_EVT_MODE_PERIODIC: - vmi_timer_ops.cancel_alarm(VMI_PERIODIC); - break; - default: - break; - } - break; - default: - break; - } -} - -static int vmi_timer_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* Unfortunately, set_next_event interface only passes relative - * expiry, but we want absolute expiry. It'd be better if were - * were passed an absolute expiry, since a bunch of time may - * have been stolen between the time the delta is computed and - * when we set the alarm below. 
*/ - cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); - - BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); - vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); - return 0; -} - -static struct clock_event_device vmi_clockevent = { - .name = "vmi-timer", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .shift = 22, - .set_mode = vmi_timer_set_mode, - .set_next_event = vmi_timer_next_event, - .rating = 1000, - .irq = 0, -}; - -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *evt = &__get_cpu_var(local_events); - evt->event_handler(evt); - return IRQ_HANDLED; -} - -static struct irqaction vmi_clock_action = { - .name = "vmi-timer", - .handler = vmi_timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, -}; - -static void __devinit vmi_time_init_clockevent(void) -{ - cycle_t cycles_per_msec; - struct clock_event_device *evt; - - int cpu = smp_processor_id(); - evt = &__get_cpu_var(local_events); - - /* Use cycles_per_msec since div_sc params are 32-bits. */ - cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(cycles_per_msec, 1000); - - memcpy(evt, &vmi_clockevent, sizeof(*evt)); - /* Must pick .shift such that .mult fits in 32-bits. Choosing - * .shift to be 22 allows 2^(32-22) cycles per nano-seconds - * before overflow. */ - evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift); - /* Upper bound is clockevent's use of ulong for cycle deltas. */ - evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); - evt->min_delta_ns = clockevent_delta2ns(1, evt); - evt->cpumask = cpumask_of(cpu); - - printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", - evt->name, evt->mult, evt->shift); - clockevents_register_device(evt); -} - -void __init vmi_time_init(void) -{ - unsigned int cpu; - /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ - outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ - - vmi_time_init_clockevent(); - setup_irq(0, &vmi_clock_action); - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; -} - -#ifdef CONFIG_X86_LOCAL_APIC -void __devinit vmi_time_bsp_init(void) -{ - /* - * On APIC systems, we want local timers to fire on each cpu. We do - * this by programming LVTT to deliver timer events to the IRQ handler - * for IRQ-0, since we can't re-use the APIC local timer handler - * without interfering with that code. - */ - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); - local_irq_disable(); -#ifdef CONFIG_SMP - /* - * XXX handle_percpu_irq only defined for SMP; we need to switch over - * to using it, since this is a local interrupt, which each CPU must - * handle individually without locking out or dropping simultaneous - * local timers on other CPUs. We also don't want to trigger the - * quirk workaround code for interrupts which gets invoked from - * handle_percpu_irq via eoi, so we use our own IRQ chip. 
- */ - set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt"); -#else - set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt"); -#endif - vmi_wiring = VMI_ALARM_WIRED_LVTT; - apic_write(APIC_LVTT, vmi_get_timer_vector()); - local_irq_enable(); - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); -} - -void __devinit vmi_time_ap_init(void) -{ - vmi_time_init_clockevent(); - apic_write(APIC_LVTT, vmi_get_timer_vector()); -} -#endif - -/** vmi clocksource */ -static struct clocksource clocksource_vmi; - -static cycle_t read_real_cycles(struct clocksource *cs) -{ - cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); - return max(ret, clocksource_vmi.cycle_last); -} - -static struct clocksource clocksource_vmi = { - .name = "vmi-timer", - .rating = 450, - .read = read_real_cycles, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -static int __init init_vmi_clocksource(void) -{ - cycle_t cycles_per_msec; - - if (!vmi_timer_ops.get_cycle_frequency) - return 0; - /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */ - cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(cycles_per_msec, 1000); - - /* Note that clocksource.{mult, shift} converts in the opposite direction - * as clockevents. */ - clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, - clocksource_vmi.shift); - - printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec); - return clocksource_register(&clocksource_vmi); - -} -module_init(init_vmi_clocksource); -- cgit v1.2.3 From b0f4c062fb6dd4c02b1fe6de73319ed50a09b27d Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 23 Aug 2010 17:05:57 -0700 Subject: x86, paravirt: Remove alloc_pmd_clone hook, only used by VMI VMI was the only user of the alloc_pmd_clone hook, given that VMI is now removed we can also remove this hook. Signed-off-by: Alok N Kataria LKML-Reference: <1282608357.19396.36.camel@ank32.eng.vmware.com> Cc: Jeremy Fitzhardinge Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/paravirt.h | 5 ----- arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/kernel/paravirt.c | 1 - arch/x86/mm/pgtable.c | 4 ---- arch/x86/xen/mmu.c | 1 - 5 files changed, 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5653f43d90e5..edecb4ed2210 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); } -static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, - unsigned long start, unsigned long count) -{ - PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count); -} static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index db9ef5532341..b82bac975250 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -255,7 +255,6 @@ struct pv_mmu_ops { */ void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); void (*release_pte)(unsigned long pfn); void (*release_pmd)(unsigned long pfn); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1db183ed7c01..c5b250011fd4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = { .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, - .alloc_pmd_clone = paravirt_nop, .alloc_pud = paravirt_nop, .release_pte = paravirt_nop, .release_pmd = paravirt_nop, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..a96023e872ae 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -98,10 +98,6 @@ static void pgd_ctor(pgd_t *pgd) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); - paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406af..b2363fcbcd0f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1969,7 +1969,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, .alloc_pmd = xen_alloc_pmd_init, - .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, #ifdef CONFIG_X86_64 -- cgit v1.2.3 From d0cd7425fab774a480cce17c2f649984312d0b55 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 24 Aug 2010 17:32:04 -0700 Subject: x86, bios: By default, reserve the low 64K for all BIOSes The laundry list of BIOSes that need the low 64K reserved is getting very long, so make it the default across all BIOSes. This also allows the code to be simplified and unified with the reservation code for the first 4K. This resolves kernel bugzilla 16661 and who knows what else... Signed-off-by: H. 
Peter Anvin LKML-Reference: --- arch/x86/Kconfig | 47 ++++++++++++++++----------- arch/x86/kernel/setup.c | 84 +++++-------------------------------------------- 2 files changed, 35 insertions(+), 96 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..683ae8f9bd0b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1326,25 +1326,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. -config X86_RESERVE_LOW_64K - bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" - default y - ---help--- - Reserve the first 64K of physical RAM on BIOSes that are known - to potentially corrupt that memory range. A numbers of BIOSes are - known to utilize this area during suspend/resume, so it must not - be used by the kernel. - - Set this to N if you are absolutely sure that you trust the BIOS - to get all its memory reservations and usages right. - - If you have doubts about the BIOS (e.g. suspend/resume does not - work or there's kernel crashes after certain hardware hotplug - events) and it's not AMI or Phoenix, then you might want to enable - X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical - corruption patterns. - - Say Y if unsure. +config X86_LOW_RESERVE + int "Amount of low memory, in kilobytes, to reserve for the BIOS" + default 64 + range 4 640 + ---help--- + Specify the amount of low memory to reserve for the BIOS. + + The first page contains BIOS data structures that the kernel + must not use, so that page must always be reserved. + + By default we reserve the first 64K of physical RAM, as a + number of BIOSes are known to corrupt that memory range + during events such as suspend/resume or monitor cable + insertion, so it must not be used by the kernel. + + You can set this to 4 if you are absolutely sure that you + trust the BIOS to get all its memory reservations and usages + right. If you know your BIOS have problems beyond the + default 64K area, you can set this to 640 to avoid using the + entire low memory range. + + If you have doubts about the BIOS (e.g. suspend/resume does + not work or there's kernel crashes after certain hardware + hotplug events) then you might want to enable + X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check + typical corruption patterns. + + Leave this to the default value of 64 if you are unsure. 
config MATH_EMULATION bool diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b996..eb87f1c83f91 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -618,88 +618,20 @@ static __init void reserve_ibft_region(void) reserve_early_overlap_ok(addr, addr + size, "ibft"); } -#ifdef CONFIG_X86_RESERVE_LOW_64K -static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE - "%s detected: BIOS may corrupt low RAM, working around it.\n", - d->ident); - - e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - - return 0; -} -#endif - -/* List of systems that have known low memory corruption BIOS problems */ -static struct dmi_system_id __initdata bad_bios_dmi_table[] = { -#ifdef CONFIG_X86_RESERVE_LOW_64K - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix/MSC BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), - }, - }, - /* - * AMI BIOS with low memory corruption was found on Intel DG45ID and - * DG45FC boards. - * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will - * match only DMI_BOARD_NAME and see if there is more bad products - * with this vendor. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), - }, - }, - /* - * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so - * match on the product name. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), - }, - }, -#endif - {} -}; - static void __init trim_bios_range(void) { /* * A special case is the first 4Kb of memory; * This is a BIOS owned area, not kernel ram, but generally * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_LOW_RESERVE. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820_update_range(0, ALIGN(CONFIG_X86_LOW_RESERVE << 10, PAGE_SIZE), + E820_RAM, E820_RESERVED); + /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. @@ -863,8 +795,6 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); - dmi_check_system(bad_bios_dmi_table); - /* * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. -- cgit v1.2.3 From acf01734b1747b1ec4be6f159aff579ea5f7f8e2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 Aug 2010 18:28:23 +0200 Subject: x86, tsc: Remove CPU frequency calibration on AMD 6b37f5a20c0e5c334c010a587058354215433e92 introduced the CPU frequency calibration code for AMD CPUs whose TSCs didn't increment with the core's P0 frequency. 
From F10h, revB onward, however, the TSC increment rate is denoted by MSRC001_0015[24] and when this bit is set (which should be done by the BIOS) the TSC increments with the P0 frequency so the calibration is not needed and booting can be a couple of msecs faster on those machines. Besides, there should be virtually no machines out there which don't have this bit set; therefore, this calibration can be safely removed. It is a shaky hack anyway since it assumes implicitly that the core is in P0 when BIOS hands off to the OS, which might not always be the case. Signed-off-by: Borislav Petkov LKML-Reference: <20100825162823.GE26438@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 17 ++++++++++++++ arch/x86/kernel/tsc.c | 58 ----------------------------------------------- 2 files changed, 17 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ba5f62f45f01..fc563fabde67 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -412,6 +412,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); } #endif + + /* We need to do the following only once */ + if (c != &boot_cpu_data) + return; + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ce8e50239332..13b6a6cc77f2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -854,60 +854,6 @@ static void __init init_tsc_clocksource(void) clocksource_register_khz(&clocksource_tsc, tsc_khz); } -#ifdef CONFIG_X86_64 -/* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency - */ -#define TICK_COUNT 100000000 -static unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... 
" - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} -#else -static inline unsigned long calibrate_cpu(void) { return cpu_khz; } -#endif - void __init tsc_init(void) { u64 lpj; @@ -926,10 +872,6 @@ void __init tsc_init(void) return; } - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calibrate_cpu(); - printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); -- cgit v1.2.3 From 9ea77bdb39b62c9bf9fd3cdd1c25a9420bccd380 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 25 Aug 2010 16:38:20 -0700 Subject: x86, bios: Make the x86 early memory reservation a kernel option Add a kernel command-line option so the x86 early memory reservation size can be adjusted at runtime instead of only at compile time. Suggested-by: Andrew Morton LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/kernel/setup.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 683ae8f9bd0b..d3590008c5dc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1326,7 +1326,7 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. -config X86_LOW_RESERVE +config X86_RESERVE_LOW int "Amount of low memory, in kilobytes, to reserve for the BIOS" default 64 range 4 640 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index eb87f1c83f91..af277e369def 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -618,6 +618,8 @@ static __init void reserve_ibft_region(void) reserve_early_overlap_ok(addr, addr + size, "ibft"); } +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; + static void __init trim_bios_range(void) { /* @@ -627,9 +629,9 @@ static void __init trim_bios_range(void) * * This typically reserves additional memory (64KiB by default) * since some BIOSes are known to corrupt low memory. See the - * Kconfig help text for X86_LOW_RESERVE. + * Kconfig help text for X86_RESERVE_LOW. 
*/ - e820_update_range(0, ALIGN(CONFIG_X86_LOW_RESERVE << 10, PAGE_SIZE), + e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), E820_RAM, E820_RESERVED); /* @@ -641,6 +643,28 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +static int __init parse_reservelow(char *p) +{ + unsigned long long size; + + if (!p) + return -EINVAL; + + size = memparse(p, &p); + + if (size < 4096) + size = 4096; + + if (size > 640*1024) + size = 640*1024; + + reserve_low = size; + + return 0; +} + +early_param("reservelow", parse_reservelow); + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures -- cgit v1.2.3 From 6afb5157b9eba4092e2f0f54d24a3806409bdde5 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Wed, 19 May 2010 17:42:14 +0800 Subject: x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions No behavior change. Move some of vmalloc_sync_all() code into a new function sync_global_pgds() that will be useful for memory hotplug. Signed-off-by: Haicheng Li LKML-Reference: <4C6E4ECD.1090607@linux.intel.com> Reviewed-by: Wu Fengguang Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_64.h | 2 ++ arch/x86/mm/fault.c | 24 +----------------------- arch/x86/mm/init_64.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 076052cd62be..f96ac9bedf75 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } +extern void sync_global_pgds(unsigned long start, unsigned long end); + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a204..51f7ee71d6c7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -326,29 +326,7 @@ out: void vmalloc_sync_all(void) { - unsigned long address; - - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a20..61a1b4fdecbf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -97,6 +97,36 @@ static int __init nonx32_setup(char *str) } __setup("noexec32=", nonx32_setup); +/* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. 
+ */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. -- cgit v1.2.3 From 9b861528a8012e7bc4d1f7bae07395b225331477 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Fri, 20 Aug 2010 17:50:16 +0800 Subject: x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes When memory hotplug-adding happens for a large enough area that a new PGD entry is needed for the direct mapping, the PGDs of other processes would not get updated. This leads to some CPUs oopsing like below when they have to access the unmapped areas. [ 1139.243192] BUG: soft lockup - CPU#0 stuck for 61s! [bash:6534] [ 1139.243195] Modules linked in: ipv6 autofs4 rfcomm l2cap crc16 bluetooth rfkill binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath dm_mod video output sbs sbshc fan battery ac parport_pc lp parport joydev usbhid processor thermal thermal_sys container button rtc_cmos rtc_core rtc_lib i2c_i801 i2c_core pcspkr uhci_hcd ohci_hcd ehci_hcd usbcore [ 1139.243229] irq event stamp: 8538759 [ 1139.243230] hardirqs last enabled at (8538759): [] restore_args+0x0/0x30 [ 1139.243236] hardirqs last disabled at (8538757): [] __do_softirq+0x106/0x146 [ 1139.243240] softirqs last enabled at (8538758): [] __do_softirq+0x137/0x146 [ 1139.243245] softirqs last disabled at (8538743): [] call_softirq+0x1c/0x34 [ 1139.243249] CPU 0: [ 1139.243250] Modules linked in: ipv6 autofs4 rfcomm l2cap crc16 bluetooth rfkill binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath dm_mod video output sbs sbshc fan battery ac parport_pc lp parport joydev usbhid processor thermal thermal_sys container button rtc_cmos rtc_core rtc_lib i2c_i801 i2c_core pcspkr uhci_hcd ohci_hcd ehci_hcd usbcore [ 1139.243284] Pid: 6534, comm: bash Tainted: G M 2.6.32-haicheng-cpuhp #7 QSSC-S4R [ 1139.243287] RIP: 0010:[] [] alloc_arraycache+0x35/0x69 [ 1139.243292] RSP: 0018:ffff8802799f9d78 EFLAGS: 00010286 [ 1139.243295] RAX: ffff8884ffc00000 RBX: ffff8802799f9d98 RCX: 0000000000000000 [ 1139.243297] RDX: 0000000000190018 RSI: 0000000000000001 RDI: ffff8884ffc00010 [ 1139.243300] RBP: ffffffff8100c34e R08: 0000000000000002 R09: 0000000000000000 [ 1139.243303] R10: ffffffff8246dda0 R11: 000000d08246dda0 R12: ffff8802599bfff0 [ 1139.243305] R13: ffff88027904c040 R14: ffff8802799f8000 R15: 0000000000000001 [ 1139.243308] FS: 00007fe81bfe86e0(0000) GS:ffff88000d800000(0000) knlGS:0000000000000000 [ 1139.243311] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1139.243313] CR2: ffff8884ffc00000 CR3: 000000026cf2d000 CR4: 00000000000006f0 [ 1139.243316] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1139.243318] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1139.243321] Call Trace: [ 1139.243324] [] ? alloc_arraycache+0x29/0x69 [ 1139.243328] [] ? cpuup_callback+0x1b0/0x32a [ 1139.243333] [] ? 
notifier_call_chain+0x33/0x5b [ 1139.243337] [] ? __raw_notifier_call_chain+0x9/0xb [ 1139.243340] [] ? cpu_up+0xb3/0x152 [ 1139.243344] [] ? store_online+0x4d/0x75 [ 1139.243348] [] ? sysdev_store+0x1b/0x1d [ 1139.243351] [] ? sysfs_write_file+0xe5/0x121 [ 1139.243355] [] ? vfs_write+0xae/0x14a [ 1139.243358] [] ? sys_write+0x47/0x6f [ 1139.243362] [] ? system_call_fastpath+0x16/0x1b This patch makes sure to always replicate new direct mapping PGD entries to the PGDs of all processes, as well as ensures corresponding vmemmap mapping gets synced. V1: initial code by Andi Kleen. V2: fix several issues found in testing. V3: as suggested by Wu Fengguang, reuse common code of vmalloc_sync_all(). [ hpa: changed pgd_change from int to bool ] Originally-by: Andi Kleen Signed-off-by: Haicheng Li LKML-Reference: <4C6E4FD8.6080100@linux.intel.com> Reviewed-by: Wu Fengguang Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 61a1b4fdecbf..64e7bc2ded93 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -564,8 +564,9 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - + bool pgd_changed = false; unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -593,7 +594,12 @@ kernel_physical_mapping_init(unsigned long start, spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; } + + if (pgd_changed) + sync_global_pgds(addr, end); + __flush_tlb_all(); return last_map_addr; @@ -1033,6 +1039,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) } } + sync_global_pgds((unsigned long)start_page, end); return 0; } -- cgit v1.2.3 From 660a293ea9be709b893d371fbc0328fcca33c33a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 27 Jul 2010 16:06:28 +0800 Subject: x86, mm: Make spurious_fault check explicitly check the PRESENT bit pte_present() returns true even present bit isn't set but _PAGE_PROTNONE (global bit) bit is set. While with CONFIG_DEBUG_PAGEALLOC, free pages have global bit set but present bit clear. This patch makes we could catch free pages access with CONFIG_DEBUG_PAGEALLOC enabled. [ hpa: added a comment in the code as a warning to janitors ] Signed-off-by: Shaohua Li LKML-Reference: <1280217988.32400.75.camel@sli10-desk.sh.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 51f7ee71d6c7..caec22906d7c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -872,8 +872,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (pmd_large(*pmd)) return spurious_fault_check(error_code, (pte_t *) pmd); + /* + * Note: don't use pte_present() here, since it returns true + * if the _PAGE_PROTNONE bit is set. However, this aliases the + * _PAGE_GLOBAL bit, which for kernel pages give false positives + * when CONFIG_DEBUG_PAGEALLOC is used. 
+ */ pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) + if (!(pte_flags(*pte) & _PAGE_PRESENT)) return 0; ret = spurious_fault_check(error_code, pte); -- cgit v1.2.3 From 9fbaf49c7f717740002d49eee1bbd03d89d8766a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 28 Aug 2010 17:41:03 +0200 Subject: x86, kmemcheck: Remove double test The opcodes 0x2e and 0x3e are tested for in the first Group 2 line as well. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @expression@ expression E; @@ ( * E || ... || E | * E && ... && E ) // Signed-off-by: Julia Lawall Reviewed-by: Pekka Enberg Cc: Vegard Nossum LKML-Reference: <1283010066-20935-5-git-send-email-julia@diku.dk> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmemcheck/opcode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 63c19e27aa6f..324aa3f07237 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b) b == 0xf0 || b == 0xf2 || b == 0xf3 /* Group 2 */ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e + || b == 0x64 || b == 0x65 /* Group 3 */ || b == 0x66 /* Group 4 */ -- cgit v1.2.3 From e6b04b6b5a3182ae36cf9a69f1aaaee432edc8ad Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:52:45 +0100 Subject: x86-64: Fix unwind annotations in syscall stubs With the return address removed from the stack, these should really refer to their caller's register state. Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBA3D0200007800013F61@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbba..16aeff0c3154 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -714,9 +714,8 @@ END(ptregscall_common) ENTRY(stub_execve) CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 + addq $8, %rsp + PARTIAL_FRAME 0 SAVE_REST FIXUP_TOP_OF_STACK %r11 movq %rsp, %rcx @@ -735,7 +734,7 @@ END(stub_execve) ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 + PARTIAL_FRAME 0 SAVE_REST movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 @@ -1445,7 +1444,6 @@ error_swapgs: error_sti: TRACE_IRQS_OFF ret - CFI_ENDPROC /* * There are two places in the kernel that can potentially fault with @@ -1470,6 +1468,7 @@ bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) jmp error_swapgs + CFI_ENDPROC END(error_entry) -- cgit v1.2.3 From 1f130a783a796f147b080c594488b566c86007d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:54:32 +0100 Subject: x86-64: Adjust frame type at paranoid_exit: As this isn't an exception or interrupt entry point, it doesn't have any of the hardware provided frame layouts active. 
Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBAA80200007800013F67@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 16aeff0c3154..64dfe3045c13 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1367,7 +1367,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) /* ebx: no swapgs flag */ ENTRY(paranoid_exit) - INTR_FRAME + DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %ebx,%ebx /* swapgs needed? */ -- cgit v1.2.3 From b1cccb1bb01dc1cb89f58723a58c3d4988d44d94 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:55:11 +0100 Subject: x86-64: Use symbolics instead of raw numbers in entry_64.S ... making the code a little less fragile. Also use pushq_cfi instead of raw CFI annotations in two more places, and add two missing annotations after stack pointer adjustments which got modified here anyway. Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBACF0200007800013F6A@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 64dfe3045c13..6f305830c80c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -795,8 +795,8 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - subq $10*8, %rsp - CFI_ADJUST_CFA_OFFSET 10*8 + subq $ORIG_RAX-ARGOFFSET+8, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 call save_args PARTIAL_FRAME 0 call \func @@ -1035,8 +1035,8 @@ ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1051,9 +1051,9 @@ END(\sym) ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 - subq $15*8, %rsp + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ @@ -1069,9 +1069,9 @@ END(\sym) ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 - subq $15*8, %rsp + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ @@ -1088,8 +1088,8 @@ END(\sym) ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1106,8 +1106,8 @@ END(\sym) ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid DEFAULT_FRAME 0 TRACE_IRQS_OFF @@ -1497,8 +1497,8 @@ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 - subq $15*8, %rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + 
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ -- cgit v1.2.3 From a34107b5577968dc53cf9c2195c7c2d4a2caf9ce Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 14:04:16 +0100 Subject: i386: Add unwind directives to syscall ptregs stubs When these stubs are actual functions (i.e. having a return instruction) and have stack manipulation instructions in them, they should also be annotated to allow unwinding through them. Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBCF00200007800013F99@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 227d00920d2f..d9b950ee5590 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -750,14 +750,18 @@ ptregs_##name: \ #define PTREGSCALL3(name) \ ALIGN; \ ptregs_##name: \ + CFI_STARTPROC; \ leal 4(%esp),%eax; \ - pushl %eax; \ + pushl_cfi %eax; \ movl PT_EDX(%eax),%ecx; \ movl PT_ECX(%eax),%edx; \ movl PT_EBX(%eax),%eax; \ call sys_##name; \ addl $4,%esp; \ - ret + CFI_ADJUST_CFA_OFFSET -4; \ + ret; \ + CFI_ENDPROC; \ +ENDPROC(ptregs_##name) PTREGSCALL1(iopl) PTREGSCALL0(fork) @@ -772,15 +776,19 @@ PTREGSCALL1(vm86old) /* Clone is an oddball. The 4th arg is in %edi */ ALIGN; ptregs_clone: + CFI_STARTPROC leal 4(%esp),%eax - pushl %eax - pushl PT_EDI(%eax) + pushl_cfi %eax + pushl_cfi PT_EDI(%eax) movl PT_EDX(%eax),%ecx movl PT_ECX(%eax),%edx movl PT_EBX(%eax),%eax call sys_clone addl $8,%esp + CFI_ADJUST_CFA_OFFSET -8 ret + CFI_ENDPROC +ENDPROC(ptregs_clone) .macro FIXUP_ESPFIX_STACK /* -- cgit v1.2.3 From df5d1874ce1a1f0e0eceff4fa3a9d45620243a68 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 14:07:16 +0100 Subject: x86: Use {push,pop}{l,q}_cfi in more places ... plus additionally introduce {push,pop}f{l,q}_cfi. All in the hope that the code becomes better readable this way (it gets quite a bit smaller in any case). 
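[ Illustration only, not part of the patch: the conversion is mechanical because each helper simply pairs the push/pop with its CFI stack-offset annotation, exactly as the pushf{l,q}_cfi/popf{l,q}_cfi macros added to dwarf2.h below do. Assuming the pre-existing pushl_cfi helper follows the same pattern, the typical before/after is:

	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4

becomes

	pushl_cfi %eax

so the generated code is unchanged and only the spelling of the unwind annotation gets shorter. ]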
Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBDA40200007800013FAF@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dwarf2.h | 20 +++ arch/x86/kernel/entry_32.S | 294 ++++++++++++++---------------------------- arch/x86/kernel/entry_64.S | 65 ++++------ 3 files changed, 141 insertions(+), 238 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 733f7e91e7a9..326099199318 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -89,6 +89,16 @@ CFI_ADJUST_CFA_OFFSET -8 .endm + .macro pushfq_cfi + pushfq + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro popfq_cfi + popfq + CFI_ADJUST_CFA_OFFSET -8 + .endm + .macro movq_cfi reg offset=0 movq %\reg, \offset(%rsp) CFI_REL_OFFSET \reg, \offset @@ -109,6 +119,16 @@ CFI_ADJUST_CFA_OFFSET -4 .endm + .macro pushfl_cfi + pushfl + CFI_ADJUST_CFA_OFFSET 4 + .endm + + .macro popfl_cfi + popfl + CFI_ADJUST_CFA_OFFSET -4 + .endm + .macro movl_cfi reg offset=0 movl %\reg, \offset(%esp) CFI_REL_OFFSET \reg, \offset diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index d9b950ee5590..9fb188d7bc76 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -115,8 +115,7 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 .endm .macro POP_GS pop=0 addl $(4 + \pop), %esp @@ -140,14 +139,12 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl %gs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %gs /*CFI_REL_OFFSET gs, 0*/ .endm .macro POP_GS pop=0 -98: popl %gs - CFI_ADJUST_CFA_OFFSET -4 +98: popl_cfi %gs /*CFI_RESTORE gs*/ .if \pop <> 0 add $\pop, %esp @@ -195,35 +192,25 @@ .macro SAVE_ALL cld PUSH_GS - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0;*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0;*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0;*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl $(__USER_DS), %edx movl %edx, %ds @@ -234,39 +221,29 @@ .endm .macro RESTORE_INT_REGS - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx CFI_RESTORE edx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebp CFI_RESTORE ebp - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax CFI_RESTORE eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl %ds - CFI_ADJUST_CFA_OFFSET -4 +1: popl_cfi %ds /*CFI_RESTORE ds;*/ -2: popl %es - CFI_ADJUST_CFA_OFFSET -4 +2: popl_cfi %es /*CFI_RESTORE es;*/ -3: popl %fs - CFI_ADJUST_CFA_OFFSET -4 +3: popl_cfi %fs /*CFI_RESTORE fs;*/ POP_GS \pop .pushsection .fixup, "ax" @@ -320,16 +297,12 @@ ENTRY(ret_from_fork) CFI_STARTPROC - pushl %eax 
- CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - pushl $0x0202 # Reset kernel eflags - CFI_ADJUST_CFA_OFFSET 4 - popfl - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + popfl_cfi jmp syscall_exit CFI_ENDPROC END(ret_from_fork) @@ -409,29 +382,23 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl $(__USER_DS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $(__USER_DS) /*CFI_REL_OFFSET ss, 0*/ - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET esp, 0 - pushfl + pushfl_cfi orl $X86_EFLAGS_IF, (%esp) - CFI_ADJUST_CFA_OFFSET 4 - pushl $(__USER_CS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $(__USER_CS) /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -486,8 +453,7 @@ sysenter_audit: movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ call audit_syscall_entry - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway - pushl %eax # save orig_eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation @@ -566,7 +531,6 @@ restore_all_notrace: je ldt_ss # returning to user-space with LDT SS restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code - CFI_ADJUST_CFA_OFFSET -4 irq_return: INTERRUPT_RETURN .section .fixup,"ax" @@ -619,10 +583,8 @@ ldt_ss: shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - CFI_ADJUST_CFA_OFFSET 4 - push %eax /* new kernel esp */ - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__ESPFIX_SS + pushl_cfi %eax /* new kernel esp */ /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ @@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and ALIGN work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx movl %eax, %esp #else movl %esp, %eax @@ -803,10 +763,8 @@ ENDPROC(ptregs_clone) mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - CFI_ADJUST_CFA_OFFSET 4 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__KERNEL_DS + pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm @@ -843,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 .endif -1: pushl $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 4 +1: pushl_cfi 
$(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -884,8 +841,7 @@ ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ - pushl $~(nr); \ - CFI_ADJUST_CFA_OFFSET 4; \ + pushl_cfi $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ @@ -901,21 +857,18 @@ ENDPROC(name) ENTRY(coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_error jmp error_code CFI_ENDPROC END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl $do_general_protection +661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" .balign 4 @@ -930,19 +883,16 @@ ENTRY(simd_coprocessor_error) 664: .previous #else - pushl $do_simd_coprocessor_error + pushl_cfi $do_simd_coprocessor_error #endif - CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_device_not_available - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int + pushl_cfi $do_device_not_available jmp error_code CFI_ENDPROC END(device_not_available) @@ -964,82 +914,68 @@ END(native_irq_enable_sysexit) ENTRY(overflow) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_overflow - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_overflow jmp error_code CFI_ENDPROC END(overflow) ENTRY(bounds) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_bounds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_bounds jmp error_code CFI_ENDPROC END(bounds) ENTRY(invalid_op) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_invalid_op - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_invalid_op jmp error_code CFI_ENDPROC END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_segment_overrun - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_segment_overrun jmp error_code CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME - pushl $do_invalid_TSS - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_invalid_TSS jmp error_code CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME - pushl $do_segment_not_present - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_segment_not_present jmp error_code CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME - pushl $do_stack_segment - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_stack_segment jmp error_code CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME - pushl $do_alignment_check - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_alignment_check jmp error_code CFI_ENDPROC END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 # no error code + pushl_cfi $do_divide_error jmp error_code CFI_ENDPROC END(divide_error) @@ -1047,10 +983,8 @@ END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi machine_check_vector jmp 
error_code CFI_ENDPROC END(machine_check) @@ -1058,10 +992,8 @@ END(machine_check) ENTRY(spurious_interrupt_bug) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_spurious_interrupt_bug - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_spurious_interrupt_bug jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) @@ -1092,8 +1024,7 @@ ENTRY(xen_sysenter_target) ENTRY(xen_hypervisor_callback) CFI_STARTPROC - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 SAVE_ALL TRACE_IRQS_OFF @@ -1129,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback) # We distinguish between categories by maintaining a status value in EAX. ENTRY(xen_failsafe_callback) CFI_STARTPROC - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl $1,%eax 1: mov 4(%esp),%ds 2: mov 8(%esp),%es 3: mov 12(%esp),%fs 4: mov 16(%esp),%gs testl %eax,%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f addl $16,%esp jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) -5: pushl $0 # EAX == 0 => Category 1 (Bad segment) - CFI_ADJUST_CFA_OFFSET 4 +5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment) SAVE_ALL jmp ret_from_exception CFI_ENDPROC @@ -1295,40 +1223,29 @@ syscall_table_size=(.-sys_call_table) ENTRY(page_fault) RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 cld movl $(__KERNEL_PERCPU), %ecx @@ -1370,12 +1287,9 @@ END(page_fault) movl TSS_sysenter_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip - pushfl - CFI_ADJUST_CFA_OFFSET 4 - pushl $__KERNEL_CS - CFI_ADJUST_CFA_OFFSET 4 - pushl $sysenter_past_esp - CFI_ADJUST_CFA_OFFSET 4 + pushfl_cfi + pushl_cfi $__KERNEL_CS + pushl_cfi $sysenter_past_esp CFI_REL_OFFSET eip, 0 .endm @@ -1385,8 +1299,7 @@ ENTRY(debug) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # error code 0 @@ -1406,32 +1319,27 @@ END(debug) */ ENTRY(nmi) RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax je nmi_espfix_stack cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. 
*/ andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax jae nmi_stack_correct cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer @@ -1460,18 +1368,14 @@ nmi_espfix_stack: * * create the pointer to lss back */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ss + pushl_cfi %esp addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi 16(%esp) .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code @@ -1485,8 +1389,7 @@ END(nmi) ENTRY(int3) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # zero error code @@ -1498,8 +1401,7 @@ END(int3) ENTRY(general_protection) RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_general_protection jmp error_code CFI_ENDPROC END(general_protection) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6f305830c80c..8851a2bb8c0b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64) .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ xorl %eax, %eax - pushq $__KERNEL_DS /* ss */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_DS /* ss */ /*CFI_REL_OFFSET ss,0*/ - pushq %rax /* rsp */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq $X86_EFLAGS_IF /* eflags - interrupts on */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ - pushq $__KERNEL_CS /* cs */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ - pushq \child_rip /* rip */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi \child_rip /* rip */ CFI_REL_OFFSET rip,0 - pushq %rax /* orig rax */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* orig rax */ .endm .macro UNFAKE_STACK_FRAME @@ -398,10 +392,8 @@ ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK,TI_flags(%r8) - push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -8 + pushq_cfi kernel_eflags(%rip) + popfq_cfi # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -521,11 +513,9 @@ sysret_careful: jnc sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi jmp sysret_check /* Handle a signal */ @@ -634,11 +624,9 @@ int_careful: jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -652,12 +640,10 @@ int_check_syscall_exit_work: /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest @@ -765,8 +751,7 
@@ vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -8 .endif -1: pushq $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 8 +1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -821,6 +806,7 @@ ret_from_intr: TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 exit_intr: @@ -902,11 +888,9 @@ retint_careful: jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -955,8 +939,7 @@ END(common_interrupt) .macro apicinterrupt num sym do_sym ENTRY(\sym) INTR_FRAME - pushq $~(\num) - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $~(\num) interrupt \do_sym jmp ret_from_intr CFI_ENDPROC @@ -1138,16 +1121,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error /* edi: new selector */ ENTRY(native_load_gs_index) CFI_STARTPROC - pushf - CFI_ADJUST_CFA_OFFSET 8 + pushfq_cfi DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popf - CFI_ADJUST_CFA_OFFSET -8 + popfq_cfi ret CFI_ENDPROC END(native_load_gs_index) @@ -1214,8 +1195,7 @@ END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC - push %rbp - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rbp CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1224,6 +1204,7 @@ ENTRY(call_softirq) push %rbp # backlink for old unwinder call __do_softirq leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) -- cgit v1.2.3 From 7fe977dab356fbd7e86aa10bf83891761107c57c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 14:01:58 +0100 Subject: i386: Make kernel_execve() suitable for stack unwinding The explicit saving and restoring of %ebx was confusing stack unwind data consumers, and it is plain unnecessary to do this within the asm(), since that was only introduced for PIC user mode consumers of the original _syscall3() macro this was derived from. Signed-off-by: Jan Beulich Cc: Arnd Bergmann LKML-Reference: <4C7FBC660200007800013F95@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/sys_i386_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index d5e06624e34a..0b0cb5fede19 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -33,8 +33,8 @@ int kernel_execve(const char *filename, const char *const envp[]) { long __res; - asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" + asm volatile ("int $0x80" : "=a" (__res) - : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); + : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory"); return __res; } -- cgit v1.2.3 From 1c5f50ee347daea013671f718b70cd6bf497bef9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 3 Sep 2010 17:04:07 +0800 Subject: x86, mm: fix uninitialized addr in kernel_physical_mapping_init() This re-adds the lost chunk in commit 9b861528a80. 
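For illustration, a simplified sketch of the fix (the __va() conversions and the loop header are taken from the hunk below; the rest of the loop body is elided, and the exact later consumer of addr is not shown here):

	start = (unsigned long)__va(start);
	end   = (unsigned long)__va(end);
	addr  = start;	/* the lost chunk: without it, addr can be consumed
			   uninitialized further down the function */

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		/* page-table setup elided */
	}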
Reported-by: Stephen Rothwell Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Haicheng Li Cc: Andi Kleen LKML-Reference: <20100903090407.GA19771@localhost> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 64e7bc2ded93..74f0f358b07b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -570,6 +570,7 @@ kernel_physical_mapping_init(unsigned long start, start = (unsigned long)__va(start); end = (unsigned long)__va(end); + addr = start; for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); -- cgit v1.2.3 From 57ab43e33122ffdc2eebca5d6de035699f0a8c06 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Sep 2010 18:39:39 +0200 Subject: x86, GART: Remove superfluous AMD64_GARTEN There is a GARTEN so use that and drop the duplicate. Signed-off-by: Borislav Petkov Cc: Dave Airlie Cc: FUJITA Tomonori LKML-Reference: <1283531981-7495-2-git-send-email-bp@amd64.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/gart.h | 1 - arch/x86/kernel/aperture_64.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4ac5b0f33fc1..fba0a72c4cc5 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -27,7 +27,6 @@ extern int fix_aperture; #define AMD64_GARTAPERTUREBASE 0x94 #define AMD64_GARTTABLEBASE 0x98 #define AMD64_GARTCACHECTL 0x9c -#define AMD64_GARTEN (1<<0) #ifdef CONFIG_GART_IOMMU extern int gart_iommu_aperture; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e17..6fabd406aa7f 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - aper_enabled = ctl & AMD64_GARTEN; + aper_enabled = ctl & GARTEN; aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; @@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - ctl &= ~AMD64_GARTEN; + ctl &= ~GARTEN; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); } } -- cgit v1.2.3 From 260133ab658bd2b80e07832a878e00405e19ff43 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Sep 2010 18:39:40 +0200 Subject: x86, GART: Disable GART table walk probes Current code tramples over bit F3x90[6] which can be used to disable GART table walk probes. However, this bit should be set for performance reasons (speed up GART table walks). We are allowed to do that since we put GART tables in UC memory later anyway. Make it so. 
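As a quick sketch of the control value being written (constants and layout are taken from the hunks below: GARTEN is bit 0, the aperture order sits in bits 3:1 per the existing "(ctl >> 1) & 7" decoding, and DISTLBWALKPRB is bit 6; treat this as illustrative rather than a drop-in):

	/* Leave translation (GARTEN) off for now, keep GART IO/CPU accesses
	 * enabled, and set DISTLBWALKPRB since the GART tables are in UC memory. */
	u32 ctl = DISTLBWALKPRB | (aper_order << 1);

	write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);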
Signed-off-by: Borislav Petkov Cc: Dave Airlie Cc: FUJITA Tomonori LKML-Reference: <1283531981-7495-3-git-send-email-bp@amd64.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/gart.h | 14 ++++++++++++++ arch/x86/kernel/aperture_64.c | 14 ++++++++------ arch/x86/kernel/pci-gart_64.c | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index fba0a72c4cc5..bf357f9b25f0 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -17,6 +17,7 @@ extern int fix_aperture; #define GARTEN (1<<0) #define DISGARTCPU (1<<4) #define DISGARTIO (1<<5) +#define DISTLBWALKPRB (1<<6) /* GART cache control register bits. */ #define INVGART (1<<0) @@ -56,6 +57,19 @@ static inline void gart_iommu_hole_init(void) extern int agp_amd64_init(void); +static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) +{ + u32 ctl; + + /* + * Don't enable translation but enable GART IO and CPU accesses. + * Also, set DISTLBWALKPRB since GART tables memory is UC. + */ + ctl = DISTLBWALKPRB | order << 1; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) { u32 tmp, ctl; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6fabd406aa7f..c9cb17368448 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -505,8 +505,13 @@ out: /* Fix up the north bridges */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { - int bus; - int dev_base, dev_limit; + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = DISTLBWALKPRB | aper_order << 1; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -515,10 +520,7 @@ out: if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) continue; - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); } } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa67..6015ee13e22b 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -601,7 +601,7 @@ static void gart_fixup_northbridges(struct sys_device *dev) * Don't enable translations just yet. That is the next * step. Restore the pre-suspend aperture settings. */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + gart_set_size_and_enable(dev, aperture_order); pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } } -- cgit v1.2.3 From d9fadd7ba99a67030783a212bcb17d11f0678433 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 2 Sep 2010 15:37:10 +0200 Subject: x86, AMD: Remove needless CPU family check (for L3 cache info) Old 32-bit AMD CPUs (all w/o L3 cache) should always return 0 for cpuid_edx(0x80000006). For unknown reason the 32-bit implementation differed from the 64-bit implementation. See commit 67cddd94799 ("i386: Add L3 cache support to AMD CPUID4 emulation"). The current check is the result of the x86 merge. 
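A hedged aside on what the remaining test checks: in CPUID leaf 0x80000006, EDX reports the L3 cache on AMD processors, and (per AMD's CPUID documentation) bits 15:12 hold the L3 associativity, which is what the 0xf000 mask isolates; a non-zero value therefore means an L3 is reported and a fourth cache leaf is emulated:

	if (c->extended_cpuid_level >= 0x80000006) {
		/* EDX[15:12] != 0 => L3 present => L1d, L1i, L2 and L3 */
		if (cpuid_edx(0x80000006) & 0xf000)
			num_cache_leaves = 4;
		else
			num_cache_leaves = 3;
	}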
Signed-off-by: Andreas Herrmann Cc: Andi Kleen LKML-Reference: <20100902133710.GA5449@loge.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fc563fabde67..0f0ace5d7db5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -540,7 +540,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) #endif if (c->extended_cpuid_level >= 0x80000006) { - if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) + if (cpuid_edx(0x80000006) & 0xf000) num_cache_leaves = 4; else num_cache_leaves = 3; -- cgit v1.2.3 From 592091c0e21655bfbdf68741dd5a920c2ac2bbe6 Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 31 Aug 2010 09:13:33 +0900 Subject: therm_throt.c: Trivial printk message fix for a unsuitable abbreviation of 'thermal' In unexpected_thermal_interrupt(), "LVT TMR interrupt" is used in error message. I don't think TMR is a suitable abbreviation for thermal. 1.TMR has been used in IA32 Architectures Software Developer's Manual, and is the abbreviation for Trigger Mode Register. 2.There is not an standard abbreviation "TMR" defined for thermal in IA32 Architectures Software Developer's Manual. 3.Though we could understand it as Thermal Monitor Register, it is easy to be misunderstood as a *TIMER* interrupt also. I think this patch will fix it. Signed-off-by: Jin Dongming Reviewed-by: Jean Delvare Cc: Brown Len Cc: Hidetoshi Seto Cc: Fenghua Yu LKML-Reference: <4C7C492D.5020704@np.css.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index c2a8b26d4fea..1d0f743c9d6e 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -348,7 +348,7 @@ static void intel_thermal_interrupt(void) static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); add_taint(TAINT_MACHINE_CHECK); } -- cgit v1.2.3 From 0f1cf415f00286a38f5ce35b459342dbfc895b50 Mon Sep 17 00:00:00 2001 From: Christian Dietrich Date: Mon, 6 Sep 2010 16:36:18 +0200 Subject: x86: Remove unnecessary #ifdef ACPI/X86_IO_ACPI The ACPI/X86_IO_ACPI ifdef isn't necessary at this point, because it is checked in an outer ifdef level already and has no effect here. Cleanup only, no functional effect. 
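A structural sketch of what is being dropped (simplified; the real code is the early-quirks.c hunk below): ati_ixp4x0_rev() sat under two back-to-back, identical preprocessor guards, so one #if/#endif pair has no effect and is removed:

	#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
	#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)	/* duplicate guard, dropped */
	static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
	{
		/* body elided */
	}
	#endif	/* matching #endif of the duplicate guard, dropped as well */
	/* ati_bugs() and the rest stay under the remaining guard */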
Signed-off-by: Christian Dietrich Acked-by: Borislav Petkov Cc: vamos-dev@i4.informatik.uni-erlangen.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index e5cc7e82e60d..e4bd78c160d8 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -97,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func) } -#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { @@ -116,7 +115,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } -#endif static void __init ati_bugs(int num, int slot, int func) { -- cgit v1.2.3 From 7ef8aa72ab176e0288f363d1247079732c5d5792 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 6 Sep 2010 15:14:17 +0200 Subject: x86, cpu: Fix renamed, not-yet-shipping AMD CPUID feature bit The AMD SSE5 feature set as-it has been replaced by some extensions to the AVX instruction set. Thus the bit formerly advertised as SSE5 is re-used for one of these extensions (XOP). Although this changes the /proc/cpuinfo output, it is not user visible, as there are no CPUs (yet) having this feature. To avoid confusion this should be added to the stable series, too. Cc: stable@kernel.org [.32.x .34.x, .35.x] Signed-off-by: Andre Przywara LKML-Reference: <1283778860-26843-2-git-send-email-andre.przywara@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 781a50b29a49..c9c73d8dedb0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -152,7 +152,7 @@ #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ +#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a09c625d526..dd54779ccbea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1996,7 +1996,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT */ | 0 /* WDT */; /* all calls to cpuid_count() should be made on the same cpu */ -- cgit v1.2.3 From 33ed82fb6c5f032151f7e9f1ac7b667f78f426b8 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 6 Sep 2010 15:14:18 +0200 Subject: x86, cpu: Update AMD CPUID feature bits AMD's public CPUID specification has been updated and some bits have got names. Add them to properly describe new CPU features. Signed-off-by: Andre Przywara LKML-Reference: <1283778860-26843-3-git-send-email-andre.przywara@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index c9c73d8dedb0..341835df7892 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -155,7 +155,11 @@ #define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ /* * Auxiliary flags: Linux defined - For features scattered in various -- cgit v1.2.3 From 6d886fd042634c0d3312bace63a5d0c541b721dc Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 6 Sep 2010 15:14:19 +0200 Subject: x86, cpu: Fix allowed CPUID bits for KVM guests The AMD extensions to AVX (FMA4, XOP) work on the same YMM register set as AVX, so they are safe for guests to use, as long as AVX itself is allowed. Add F16C and AES on the way for the same reasons. Signed-off-by: Andre Przywara LKML-Reference: <1283778860-26843-4-git-send-email-andre.przywara@amd.com> Acked-by: Avi Kivity Signed-off-by: H. Peter Anvin --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dd54779ccbea..6c2ecf0a806d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1991,13 +1991,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT */ | 0 /* WDT */; + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.2.3 From aeb9c7d618264dcf6eea39142fefee096c3b09e2 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 6 Sep 2010 15:14:20 +0200 Subject: x86, kvm: add new AMD SVM feature bits The recently updated CPUID specification names new SVM feature bits. Add them to the list of reported features. Signed-off-by: Andre Przywara LKML-Reference: <1283778860-26843-5-git-send-email-andre.przywara@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 7 +++++++ arch/x86/kernel/cpu/scattered.c | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 341835df7892..bffeab7eab97 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -183,6 +183,13 @@ #define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ #define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ #define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */ + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 34b4dad6f0b8..2c77931473fb 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,6 +43,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, + { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, + { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, + { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, + { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, + { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, { 0, 0, 0, 0, 0 } }; -- cgit v1.2.3 From 37a2f9f30a360fb03522d15c85c78265ccd80287 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 8 Sep 2010 10:14:27 -0500 Subject: x86, kdump: Change copy_oldmem_page() to use cached addressing The copy of /proc/vmcore to a user buffer proceeds much faster if the kernel addresses memory as cached. With this patch we have seen an increase in transfer rate from less than 15MB/s to 80-460MB/s, depending on size of the transfer. This makes a big difference in time needed to save a system dump. Signed-off-by: Cliff Wickman Acked-by: "Eric W. Biederman" Cc: kexec@lists.infradead.org Cc: # as far back as it would apply LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash_dump_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045b36cada65..bf43188ca654 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); if (!vaddr) return -ENOMEM; -- cgit v1.2.3 From 2df7a6e9e8e67c19e5fe2eac3f2d2223b7bb4a7b Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:08 -0400 Subject: x86: Use correct type for %cr4 %cr4 is 64-bit in 64-bit mode (although the upper 32-bits are currently reserved). 
Use unsigned long for the temporary variable to get the right size. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbebaa..396b80f2d871 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -602,7 +602,7 @@ extern unsigned long mmu_cr4_features; static inline void set_in_cr4(unsigned long mask) { - unsigned cr4; + unsigned long cr4; mmu_cr4_features |= mask; cr4 = read_cr4(); @@ -612,7 +612,7 @@ static inline void set_in_cr4(unsigned long mask) static inline void clear_in_cr4(unsigned long mask) { - unsigned cr4; + unsigned long cr4; mmu_cr4_features &= ~mask; cr4 = read_cr4(); -- cgit v1.2.3 From 6ac8bac2684235f4caf22a410549c582aa7327d6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:09 -0400 Subject: x86, fpu: Merge fpu_init() Make fpu_init() handle 32-bit setup. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 7 ------- arch/x86/kernel/i387.c | 30 ++++++++++++++---------------- arch/x86/kernel/traps.c | 12 ------------ 3 files changed, 14 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490dac63c2d2..f9e23e8639eb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1264,13 +1264,6 @@ void __cpuinit cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - /* - * Force FPU initialization: - */ - current_thread_info()->status = 0; - clear_used_math(); - mxcsr_feature_mask_init(); - fpu_init(); xsave_init(); } diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2605c50b11d3..82166519497a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -85,7 +85,6 @@ static void __cpuinit init_thread_xstate(void) #endif } -#ifdef CONFIG_X86_64 /* * Called at bootup to set up the initial FPU state that is later cloned * into all processes. 
@@ -93,12 +92,21 @@ static void __cpuinit init_thread_xstate(void) void __cpuinit fpu_init(void) { - unsigned long oldcr0 = read_cr0(); - - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); + unsigned long cr0; + unsigned long cr4_mask = 0; - write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + set_in_cr4(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!HAVE_HWFP) + cr0 |= X86_CR0_EM; + write_cr0(cr0); if (!smp_processor_id()) init_thread_xstate(); @@ -109,16 +117,6 @@ void __cpuinit fpu_init(void) clear_used_math(); } -#else /* CONFIG_X86_64 */ - -void __cpuinit fpu_init(void) -{ - if (!smp_processor_id()) - init_thread_xstate(); -} - -#endif /* CONFIG_X86_32 */ - void fpu_finit(struct fpu *fpu) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 60788dee0f8a..d0029eb58589 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -881,18 +881,6 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... "); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - set_system_trap_gate(SYSCALL_VECTOR, &system_call); set_bit(SYSCALL_VECTOR, used_vectors); #endif -- cgit v1.2.3 From 51115d4d45700fc7c08306f7ba6e68551f526ae5 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:10 -0400 Subject: x86, fpu: Merge tolerant_fwait() Commit e2e75c91 merged the math exception handler, allowing both 32-bit and 64-bit to handle math exceptions from kernel mode. Switch to using the 64-bit version of tolerant_fwait() without fnclex, which simply ignores the exception if one is still pending from userspace. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a73a8d5a5e69..5d8f9a79fa5d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -77,15 +77,6 @@ static inline void sanitize_i387_state(struct task_struct *tsk) } #ifdef CONFIG_X86_64 - -/* Ignore delayed exceptions from user space */ -static inline void tolerant_fwait(void) -{ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); -} - static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { int err; @@ -220,11 +211,6 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft); static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif -static inline void tolerant_fwait(void) -{ - asm volatile("fnclex ; fwait"); -} - /* perform fxrstor iff the processor has extended states, otherwise frstor */ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { @@ -344,7 +330,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk) static inline void __clear_fpu(struct task_struct *tsk) { if (task_thread_info(tsk)->status & TS_USEDFPU) { - tolerant_fwait(); + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } -- cgit v1.2.3 From bfd946cb891800d408decaae268a3480775178a3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:11 -0400 Subject: x86, fpu: Merge __save_init_fpu() __save_init_fpu() is identical for 32-bit and 64-bit. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-5-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 5d8f9a79fa5d..88065e3a03c0 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -197,12 +197,6 @@ static inline void fpu_save_init(struct fpu *fpu) fpu_clear(fpu); } -static inline void __save_init_fpu(struct task_struct *tsk) -{ - fpu_save_init(&tsk->thread.fpu); - task_thread_info(tsk)->status &= ~TS_USEDFPU; -} - #else /* CONFIG_X86_32 */ #ifdef CONFIG_MATH_EMULATION @@ -285,15 +279,14 @@ end: ; } +#endif /* CONFIG_X86_64 */ + static inline void __save_init_fpu(struct task_struct *tsk) { fpu_save_init(&tsk->thread.fpu); task_thread_info(tsk)->status &= ~TS_USEDFPU; } - -#endif /* CONFIG_X86_64 */ - static inline int fpu_fxrstor_checking(struct fpu *fpu) { return fxrstor_checking(&fpu->state->fxsave); -- cgit v1.2.3 From a4d4fbc7735bba6654b20f859135f9d3f8fe7f76 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:12 -0400 Subject: x86-64, fpu: Disable preemption when using TS_USEDFPU Consolidates code and fixes the below race for 64-bit. commit 9fa2f37bfeb798728241cc4a19578ce6e4258f25 Author: torvalds Date: Tue Sep 2 07:37:25 2003 +0000 Be a lot more careful about TS_USEDFPU and preemption We had some races where we testecd (or set) TS_USEDFPU together with sequences that depended on the setting (like clearing or setting the TS flag in %cr0) and we could be preempted in between, which screws up the FPU state, since preemption will itself change USEDFPU and the TS flag. 
This makes it a lot more explicit: the "internal" low-level FPU functions ("__xxxx_fpu()") all require preemption to be disabled, and the exported "real" functions will make sure that is the case. One case - in __switch_to() - was switched to the non-preempt-safe internal version, since the scheduler itself has already disabled preemption. BKrev: 3f5448b5WRiQuyzAlbajs3qoQjSobw Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 15 --------------- arch/x86/kernel/process_64.c | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 88065e3a03c0..8b40a8379cc2 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -387,19 +387,6 @@ static inline void irq_ts_restore(int TS_state) stts(); } -#ifdef CONFIG_X86_64 - -static inline void save_init_fpu(struct task_struct *tsk) -{ - __save_init_fpu(tsk); - stts(); -} - -#define unlazy_fpu __unlazy_fpu -#define clear_fpu __clear_fpu - -#else /* CONFIG_X86_32 */ - /* * These disable preemption on their own and are safe */ @@ -425,8 +412,6 @@ static inline void clear_fpu(struct task_struct *tsk) preempt_enable(); } -#endif /* CONFIG_X86_64 */ - /* * i387 state interaction */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d9ea531ddd1..b3d7a3a04f38 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); /* Must be after DS reload */ - unlazy_fpu(prev_p); + __unlazy_fpu(prev_p); /* Make sure cpu is ready for new context */ if (preload_fpu) -- cgit v1.2.3 From 10c11f304986a1f84201c2261a428701f9d2dffc Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:13 -0400 Subject: x86-64, fpu: Fix %cs value in convert_from_fxsr() While %ds still contains the userspace selector, %cs is KERNEL_CS at this point. Always get %cs from pt_regs even for the current task. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-7-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 82166519497a..f3775f5ef4ab 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -389,19 +389,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) #ifdef CONFIG_X86_64 env->fip = fxsave->rip; env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; if (tsk == current) { - /* - * should be actually ds/cs at fpu exception time, but - * that information is not available in 64bit mode. 
- */ - asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos)); - asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs)); + savesegment(ds, env->fos); } else { - struct pt_regs *regs = task_pt_regs(tsk); - - env->fos = 0xffff0000 | tsk->thread.ds; - env->fcs = regs->cs; + env->fos = tsk->thread.ds; } + env->fos |= 0xffff0000; #else env->fip = fxsave->fip; env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); -- cgit v1.2.3 From 820241356d6aa9a895fc10def15794a5a5bfcd98 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:14 -0400 Subject: x86-64, fpu: Simplify constraints for fxsave/fxtstor Use the "R" constraint (legacy register) instead of listing all the possible registers. Clean up the comments as well. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-8-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 8b40a8379cc2..768fcb25900a 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -81,6 +81,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { int err; + /* See comment in fxsave() below. */ asm volatile("1: rex64/fxrstor (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -89,11 +90,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) -#if 0 /* See comment in fxsave() below. */ - : [fx] "r" (fx), "m" (*fx), "0" (0)); -#else - : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); -#endif + : [fx] "R" (fx), "m" (*fx), "0" (0)); return err; } @@ -140,6 +137,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) if (unlikely(err)) return -EFAULT; + /* See comment in fxsave() below. */ asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -148,11 +146,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) -#if 0 /* See comment in fxsave() below. */ - : [fx] "r" (fx), "0" (0)); -#else - : [fx] "cdaSDb" (fx), "0" (0)); -#endif + : [fx] "R" (fx), "0" (0)); if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct))) err = -EFAULT; @@ -165,26 +159,22 @@ static inline void fpu_fxsave(struct fpu *fpu) /* Using "rex64; fxsave %0" is broken because, if the memory operand uses any extended registers for addressing, a second REX prefix will be generated (to the assembler, rex64 followed by semicolon - is a separate instruction), and hence the 64-bitness is lost. */ -#if 0 - /* Using "fxsaveq %0" would be the ideal choice, but is only supported - starting with gas 2.16. */ - __asm__ __volatile__("fxsaveq %0" - : "=m" (fpu->state->fxsave)); -#elif 0 - /* Using, as a workaround, the properly prefixed form below isn't + is a separate instruction), and hence the 64-bitness is lost. + Using "fxsaveq %0" would be the ideal choice, but is only supported + starting with gas 2.16. + asm volatile("fxsaveq %0" + : "=m" (fpu->state->fxsave)); + Using, as a workaround, the properly prefixed form below isn't accepted by any binutils version so far released, complaining that the same type of prefix is used twice if an extended register is - needed for addressing (fix submitted to mainline 2005-11-21). 
*/ - __asm__ __volatile__("rex64/fxsave %0" - : "=m" (fpu->state->fxsave)); -#else - /* This, however, we can work around by forcing the compiler to select + needed for addressing (fix submitted to mainline 2005-11-21). + asm volatile("rex64/fxsave %0" + : "=m" (fpu->state->fxsave)); + This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. */ - __asm__ __volatile__("rex64/fxsave (%1)" - : "=m" (fpu->state->fxsave) - : "cdaSDb" (&fpu->state->fxsave)); -#endif + asm volatile("rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); } static inline void fpu_save_init(struct fpu *fpu) -- cgit v1.2.3 From a334fe43d85f570ae907acf988a053c5eff78d6e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:15 -0400 Subject: x86-32, fpu: Remove math_emulate stub check_fpu() in bugs.c halts boot if no FPU is found and math emulation isn't enabled. Therefore this stub will never be used. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-9-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/traps.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d0029eb58589..d43968503dd2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void) } EXPORT_SYMBOL_GPL(math_state_restore); -#ifndef CONFIG_MATH_EMULATION -void math_emulate(struct math_emu_info *info) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 +#ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; @@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); + return; } -#else - math_state_restore(); +#endif + math_state_restore(); /* interrupts still off */ +#ifdef CONFIG_X86_32 + conditional_sti(regs); #endif } -- cgit v1.2.3 From 8eb91a577d7763d21628f6761045328784b1911c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:16 -0400 Subject: x86, fpu: Remove unnecessary ifdefs from i387 code. Remove ifdefs for code that the compiler can optimize away on 64-bit. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-10-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 12 ++++++------ arch/x86/kernel/i387.c | 4 ---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 768fcb25900a..42b507e16d1d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf); extern int restore_i387_xstate_ia32(void __user *buf); #endif +#ifdef CONFIG_MATH_EMULATION +extern void finit_soft_fpu(struct i387_soft_struct *soft); +#else +static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} +#endif + #define X87_FSW_ES (1 << 7) /* Exception Summary */ static __always_inline __pure bool use_xsaveopt(void) @@ -189,12 +195,6 @@ static inline void fpu_save_init(struct fpu *fpu) #else /* CONFIG_X86_32 */ -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - /* perform fxrstor iff the processor has extended states, otherwise frstor */ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f3775f5ef4ab..e795e360bd50 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -79,10 +79,8 @@ static void __cpuinit init_thread_xstate(void) if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); -#ifdef CONFIG_X86_32 else xstate_size = sizeof(struct i387_fsave_struct); -#endif } /* @@ -119,12 +117,10 @@ void __cpuinit fpu_init(void) void fpu_finit(struct fpu *fpu) { -#ifdef CONFIG_X86_32 if (!HAVE_HWFP) { finit_soft_fpu(&fpu->state->soft); return; } -#endif if (cpu_has_fxsr) { struct i387_fxsave_struct *fx = &fpu->state->fxsave; -- cgit v1.2.3 From eec73f813ab0954253e5e2168119c4555f83f07d Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:17 -0400 Subject: x86, fpu: Remove PSHUFB_XMM5_* macros The PSHUFB_XMM5_* macros are no longer used. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-11-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 42b507e16d1d..907967e4fa8d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -465,7 +465,4 @@ extern void fpu_finit(struct fpu *fpu); #endif /* __ASSEMBLY__ */ -#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 -#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 - #endif /* _ASM_X86_I387_H */ -- cgit v1.2.3 From 58a992b9cbaf449aeebd3575c3695a9eb5d95b5e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:18 -0400 Subject: x86-32, fpu: Rewrite fpu_save_init() Rewrite fpu_save_init() to prepare for merging with 64-bit. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-12-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 47 +++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 907967e4fa8d..b45abefb89f2 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -73,6 +73,11 @@ static __always_inline __pure bool use_xsave(void) return static_cpu_has(X86_FEATURE_XSAVE); } +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has(X86_FEATURE_FXSR); +} + extern void __sanitize_i387_state(struct task_struct *); static inline void sanitize_i387_state(struct task_struct *tsk) @@ -211,6 +216,12 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return 0; } +static inline void fpu_fxsave(struct fpu *fpu) +{ + asm volatile("fxsave %[fx]" + : [fx] "=m" (fpu->state->fxsave)); +} + /* We need a safe address that is cheap to find and that is already in L1 during context switch. The best choices are unfortunately different for UP and SMP */ @@ -226,36 +237,24 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) static inline void fpu_save_init(struct fpu *fpu) { if (use_xsave()) { - struct xsave_struct *xstate = &fpu->state->xsave; - struct i387_fxsave_struct *fx = &fpu->state->fxsave; - fpu_xsave(fpu); /* * xsave header may indicate the init state of the FP. */ - if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) - goto end; - - if (unlikely(fx->swd & X87_FSW_ES)) - asm volatile("fnclex"); - - /* - * we can do a simple return here or be paranoid :) - */ - goto clear_state; + if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) + return; + } else if (use_fxsr()) { + fpu_fxsave(fpu); + } else { + asm volatile("fsave %[fx]; fwait" + : [fx] "=m" (fpu->state->fsave)); + return; } - /* Use more nops than strictly needed in case the compiler - varies code */ - alternative_input( - "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4, - "fxsave %[fx]\n" - "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", - X86_FEATURE_FXSR, - [fx] "m" (fpu->state->fxsave), - [fsw] "m" (fpu->state->fxsave.swd) : "memory"); -clear_state: + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) + asm volatile("fnclex"); + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. safe_address is a random variable that should be in L1 */ @@ -265,8 +264,6 @@ clear_state: "fildl %[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, [addr] "m" (safe_address)); -end: - ; } #endif /* CONFIG_X86_64 */ -- cgit v1.2.3 From b2b57fe053c9cf8b8af5a0e826a465996afed0ff Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:19 -0400 Subject: x86, fpu: Merge fpu_save_init() Make 64-bit use the 32-bit version of fpu_save_init(). Remove unused clear_fpu_state(). Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-13-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 48 ++++----------------------------------------- 1 file changed, 4 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index b45abefb89f2..70626ed96cb5 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -105,36 +105,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return err; } -/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. The kernel data segment can be sometimes 0 and sometimes - new user value. Both should be ok. - Use the PDA as safe address because it should be already in L1. */ -static inline void fpu_clear(struct fpu *fpu) -{ - struct xsave_struct *xstate = &fpu->state->xsave; - struct i387_fxsave_struct *fx = &fpu->state->fxsave; - - /* - * xsave header may indicate the init state of the FP. - */ - if (use_xsave() && - !(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) - return; - - if (unlikely(fx->swd & X87_FSW_ES)) - asm volatile("fnclex"); - alternative_input(ASM_NOP8 ASM_NOP2, - " emms\n" /* clear stack tags */ - " fildl %%gs:0", /* load to clear state */ - X86_FEATURE_FXSAVE_LEAK); -} - -static inline void clear_fpu_state(struct task_struct *tsk) -{ - fpu_clear(&tsk->thread.fpu); -} - static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { int err; @@ -188,16 +158,6 @@ static inline void fpu_fxsave(struct fpu *fpu) : [fx] "R" (&fpu->state->fxsave)); } -static inline void fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) - fpu_xsave(fpu); - else - fpu_fxsave(fpu); - - fpu_clear(fpu); -} - #else /* CONFIG_X86_32 */ /* perform fxrstor iff the processor has extended states, otherwise frstor */ @@ -222,6 +182,8 @@ static inline void fpu_fxsave(struct fpu *fpu) : [fx] "=m" (fpu->state->fxsave)); } +#endif /* CONFIG_X86_64 */ + /* We need a safe address that is cheap to find and that is already in L1 during context switch. The best choices are unfortunately different for UP and SMP */ @@ -259,15 +221,13 @@ static inline void fpu_save_init(struct fpu *fpu) is pending. Clear the x87 state here by setting it to fixed values. safe_address is a random variable that should be in L1 */ alternative_input( - GENERIC_NOP8 GENERIC_NOP2, + ASM_NOP8 ASM_NOP2, "emms\n\t" /* clear stack tags */ - "fildl %[addr]", /* set F?P to defined value */ + "fildl %P[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, [addr] "m" (safe_address)); } -#endif /* CONFIG_X86_64 */ - static inline void __save_init_fpu(struct task_struct *tsk) { fpu_save_init(&tsk->thread.fpu); -- cgit v1.2.3 From a7f07cfbaa1dd5bf9e615948f280c92e7928e6f7 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Sep 2010 15:55:49 -0700 Subject: x86, mtrr: Refactor MTRR type overlap check code Move the MTRR type overlap check into a new function. No functional change in this patch. Just making it easier to add multiple region overlap check in the following patch. Signed-off-by: Venkatesh Pallipadi LKML-Reference: <1284159350-19841-2-git-send-email-venki@google.com> Reviewed-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 44 +++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d28d7d03885..14f4f0c0329a 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -64,6 +64,33 @@ static inline void k8_check_syscfg_dram_mod_en(void) } } +/* + * Check and return the effective type for MTRR-MTRR type overlap. + * Returns 1 if the effective type is UNCACHEABLE, else returns 0 + */ +static int check_type_overlap(u8 *prev, u8 *curr) +{ + if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) || + (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) { + *prev = MTRR_TYPE_WRTHROUGH; + *curr = MTRR_TYPE_WRTHROUGH; + } + + if (*prev != *curr) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + return 0; +} + /* * Returns the effective MTRR type for the region * Error returns: @@ -138,21 +165,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) continue; } - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) { - return MTRR_TYPE_UNCACHABLE; - } - - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; - } - - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; + if (check_type_overlap(&prev_match, &curr_match)) + return curr_match; } if (mtrr_tom2) { -- cgit v1.2.3 From 351e5a703ad994405bd900da330823d3b4a372e0 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Sep 2010 15:55:50 -0700 Subject: x86, mtrr: Support mtrr lookup for range spanning across MTRR range mtrr_type_lookup [start:end] looked up the resultant MTRR type for that range, based on fixed and all variable MTRR ranges. It did check for multiple MTRR var ranges overlapping [start:end] and returned the net type. However, if the [start:end] range spanned across any var MTRR range, mtrr_type_lookup would return an error return of 0xFE. This was based on typical usage of mtrr_type_lookup in PAT mapping, where region being mapped would not normally span across MTRR ranges and also trying to keep the code simple. Mark recently reported the problem with this limitation. When there are two continguous MTRR's of type "writeback" and if there is a memory mapping over a region starting in one MTRR range and ending in another MTRR range, such mapping will fallback to "uncached" due to the above limitation. Change below adds support for such lookups spanning multiple MTRR ranges. We now have a wrapper mtrr_type_lookup that dynamically splits such a region into smaller chunks that fit within one MTRR range and does a __mtrr_type_lookup on it and combine the results later. Reported-by: Mark Langsdorf Signed-off-by: Venkatesh Pallipadi LKML-Reference: <1284159350-19841-3-git-send-email-venki@google.com> Reviewed-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 84 ++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 14f4f0c0329a..9f27228ceffd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -64,6 +64,18 @@ static inline void k8_check_syscfg_dram_mod_en(void) } } +/* Get the size of contiguous MTRR range */ +static u64 get_mtrr_size(u64 mask) +{ + u64 size; + + mask >>= PAGE_SHIFT; + mask |= size_or_mask; + size = -mask; + size <<= PAGE_SHIFT; + return size; +} + /* * Check and return the effective type for MTRR-MTRR type overlap. * Returns 1 if the effective type is UNCACHEABLE, else returns 0 @@ -92,17 +104,19 @@ static int check_type_overlap(u8 *prev, u8 *curr) } /* - * Returns the effective MTRR type for the region - * Error returns: - * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR - * - 0xFF - when MTRR is not enabled + * Error/Semi-error returns: + * 0xFF - when MTRR is not enabled + * *repeat == 1 implies [start:end] spanned across MTRR range and type returned + * corresponds only to [start:*partial_end]. + * Caller has to lookup again for [*partial_end:end]. */ -u8 mtrr_type_lookup(u64 start, u64 end) +static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) { int i; u64 base, mask; u8 prev_match, curr_match; + *repeat = 0; if (!mtrr_state_set) return 0xFF; @@ -153,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; + + if (start_state != end_state) { + /* + * We have start:end spanning across an MTRR. + * We split the region into + * either + * (start:mtrr_end) (mtrr_end:end) + * or + * (start:mtrr_start) (mtrr_start:end) + * depending on kind of overlap. + * Return the type for first region and a pointer to + * the start of second region so that caller will + * lookup again on the second region. + * Note: This way we handle multiple overlaps as well. + */ + if (start_state) + *partial_end = base + get_mtrr_size(mask); + else + *partial_end = base; + + if (unlikely(*partial_end <= start)) { + WARN_ON(1); + *partial_end = start + PAGE_SIZE; + } + + end = *partial_end - 1; /* end is inclusive */ + *repeat = 1; + } if ((start & mask) != (base & mask)) continue; @@ -180,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end) return mtrr_state.def_type; } +/* + * Returns the effective MTRR type for the region + * Error return: + * 0xFF - when MTRR is not enabled + */ +u8 mtrr_type_lookup(u64 start, u64 end) +{ + u8 type, prev_type; + int repeat; + u64 partial_end; + + type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + + /* + * Common path is with repeat = 0. + * However, we can have cases where [start:end] spans across some + * MTRR range. Do repeated lookups for that case here. 
+ */ + while (repeat) { + prev_type = type; + start = partial_end; + type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + + if (check_type_overlap(&prev_type, &type)) + return type; + } + + return type; +} + /* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) -- cgit v1.2.3 From d0ed0c32662e756e7daf85e70a5a27a9c1111331 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 11 Sep 2010 22:10:54 -0700 Subject: x86: Remove pr_ uses of KERN_ Signed-off-by: Joe Perches Cc: Jiri Kosina LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apb_timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 8dd77800ff5d..4004417a6b53 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -398,7 +398,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, } break; default: - pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + pr_debug("APBT notified %lu, no action\n", action); } return NOTIFY_OK; } @@ -552,7 +552,7 @@ bad_count: pr_debug("APB CS going back %lx:%lx:%lx ", t2, last_read, t2 - last_read); bad_count_x3: - pr_debug(KERN_INFO "tripple check enforced\n"); + pr_debug("triple check enforced\n"); t0 = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE); udelay(1); -- cgit v1.2.3 From 75e3cfbed6f71a8f151dc6e413b6ce3c390030cb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 27 Aug 2010 11:09:48 -0700 Subject: x86, intr-remap: Set redirection hint in the IRTE Currently the redirection hint in the interrupt-remapping table entry is set to 0, which means the remapped interrupt is directed to the processors listed in the destination. So in logical flat mode in the presence of intr-remapping, this results in a single interrupt multi-casted to multiple cpu's as specified by the destination bit mask. But what we really want is to send that interrupt to one of the cpus based on the lowest priority delivery mode. Set the redirection hint in the IRTE to '1' to indicate that we want the remapped interrupt to be directed to only one of the processors listed in the destination. This fixes the issue of same interrupt getting delivered to multiple cpu's in the logical flat mode in the presence of interrupt-remapping. While there is no functional issue observed with this behavior, this will impact performance of such configurations (<=8 cpu's using logical flat mode in the presence of interrupt-remapping) Signed-off-by: Suresh Siddha LKML-Reference: <20100827181049.013051492@sbsiddha-MOBL3.sc.intel.com> Cc: Weidong Han Cc: # [v2.6.32+] Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f1efebaf5510..90f8a75f548f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1392,6 +1392,7 @@ int setup_ioapic_entry(int apic_id, int irq, irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = vector; irte.dest_id = IRTE_DEST(destination); + irte.redir_hint = 1; /* Set source-id of interrupt request */ set_ioapic_sid(&irte, apic_id); @@ -3343,6 +3344,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); + irte.redir_hint = 1; /* Set source-id of interrupt request */ if (pdev) -- cgit v1.2.3 From 62a92f4c69cd1d9361ad8c16be1dd16e6821bc15 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 27 Aug 2010 11:09:49 -0700 Subject: x86, intr-remap: Remove IRTE setup duplicate code Remove IRTE setup duplicate code with prepare_irte(). Signed-off-by: Suresh Siddha LKML-Reference: <20100827181049.095067319@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_remapping.h | 27 +++++++++++++++++++++++++++ arch/x86/kernel/apic/io_apic.c | 27 ++------------------------- 2 files changed, 29 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index f275e2244505..8d841505344e 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -3,4 +3,31 @@ #define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) +#ifdef CONFIG_INTR_REMAP +static inline void prepare_irte(struct irte *irte, int vector, + unsigned int dest) +{ + memset(irte, 0, sizeof(*irte)); + + irte->present = 1; + irte->dst_mode = apic->irq_dest_mode; + /* + * Trigger mode in the IRTE will always be edge, and for IO-APIC, the + * actual level or edge trigger will be setup in the IO-APIC + * RTE. This will help simplify level triggered irq migration. + * For more details, see the comments (in io_apic.c) explainig IO-APIC + * irq migration in the presence of interrupt-remapping. + */ + irte->trigger_mode = 0; + irte->dlvry_mode = apic->irq_delivery_mode; + irte->vector = vector; + irte->dest_id = IRTE_DEST(dest); + irte->redir_hint = 1; +} +#else +static void prepare_irte(struct irte *irte, int vector, unsigned int dest) +{ +} +#endif + #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 90f8a75f548f..e8c95a22614a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1377,22 +1377,7 @@ int setup_ioapic_entry(int apic_id, int irq, if (index < 0) panic("Failed to allocate IRTE for ioapic %d\n", apic_id); - memset(&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = apic->irq_dest_mode; - /* - * Trigger mode in the IRTE will always be edge, and the - * actual level or edge trigger will be setup in the IO-APIC - * RTE. This will help simplify level triggered irq migration. - * For more details, see the comments above explainig IO-APIC - * irq migration in the presence of interrupt-remapping. 
- */ - irte.trigger_mode = 0; - irte.dlvry_mode = apic->irq_delivery_mode; - irte.vector = vector; - irte.dest_id = IRTE_DEST(destination); - irte.redir_hint = 1; + prepare_irte(&irte, vector, destination); /* Set source-id of interrupt request */ set_ioapic_sid(&irte, apic_id); @@ -3336,15 +3321,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, ir_index = map_irq_to_irte_handle(irq, &sub_handle); BUG_ON(ir_index == -1); - memset (&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = apic->irq_dest_mode; - irte.trigger_mode = 0; /* edge */ - irte.dlvry_mode = apic->irq_delivery_mode; - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - irte.redir_hint = 1; + prepare_irte(&irte, cfg->vector, dest); /* Set source-id of interrupt request */ if (pdev) -- cgit v1.2.3 From fa47f7e52874683a9659df2f1f143105f676dc0f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 27 Aug 2010 11:09:50 -0700 Subject: x86, x2apic: Simplify apic init in SMP and UP builds Move enable_IR_x2apic() inside the default_setup_apic_routing(), and for SMP platforms, move the default_setup_apic_routing() after smp_sanity_check(). This cleans up the code that tries to avoid multiple calls to default_setup_apic_routing() when smp_sanity_check() fails (which goes through the APIC_init_uniprocessor() path). Signed-off-by: Suresh Siddha LKML-Reference: <20100827181049.173087246@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 3 --- arch/x86/kernel/apic/probe_64.c | 3 +++ arch/x86/kernel/smpboot.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e3b534cda49a..8cf86fb3b4e3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1665,10 +1665,7 @@ int __init APIC_init_uniprocessor(void) } #endif -#ifndef CONFIG_SMP - enable_IR_x2apic(); default_setup_apic_routing(); -#endif verify_local_APIC(); connect_bsp_APIC(); diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 83e9be4778e2..f9e4e6a54073 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -54,6 +54,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) */ void __init default_setup_apic_routing(void) { + + enable_IR_x2apic(); + #ifdef CONFIG_X86_X2APIC if (x2apic_mode #ifdef CONFIG_X86_UV diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8b3bfc4dd708..87a8c6b00f8d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1109,8 +1109,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); - enable_IR_x2apic(); - default_setup_apic_routing(); if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1118,6 +1116,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) goto out; } + default_setup_apic_routing(); + preempt_disable(); if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", -- cgit v1.2.3 From 3ee48b6af49cf534ca2f481ecc484b156a41451d Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 16 Sep 2010 11:44:02 -0500 Subject: mm, x86: Saving vmcore with non-lazy freeing of vmas During the reading of /proc/vmcore the kernel is doing ioremap()/iounmap() repeatedly. And the buildup of un-flushed vm_area_struct's is causing a great deal of overhead. (rb_next() is chewing up most of that time). 
This solution is to provide function set_iounmap_nonlazy(). It causes a subsequent call to iounmap() to immediately purge the vma area (with try_purge_vmap_area_lazy()). With this patch we have seen the time for writing a 250MB compressed dump drop from 71 seconds to 44 seconds. Signed-off-by: Cliff Wickman Cc: Andrew Morton Cc: kexec@lists.infradead.org Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 1 + arch/x86/kernel/crash_dump_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 30a3e9776123..6a45ec41ec26 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -206,6 +206,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); +extern void set_iounmap_nonlazy(void); #ifdef __KERNEL__ diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index bf43188ca654..994828899e09 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, } else memcpy(buf, vaddr + offset, csize); + set_iounmap_nonlazy(); iounmap(vaddr); return csize; } -- cgit v1.2.3 From 3518dd14ca888085797ca8d3a9e11c8ef9e7ae68 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Sep 2010 18:07:45 +0200 Subject: x86, cacheinfo: Fix dependency of AMD L3 CID L3 cache index disable code uses PCI accesses to AMD northbridge functions. Currently the code is #ifdef CONFIG_CPU_SUP_AMD. But it should be #if (defined(CONFIG_CPU_SUP_AMD) && defined(CONFIG_PCI)) which in the end is a dependency to K8_NB. Signed-off-by: Andreas Herrmann LKML-Reference: <20100917160744.GF4958@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 898c2f4eab88..2521cdcb877e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -306,7 +306,7 @@ struct _cache_attr { ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); }; -#ifdef CONFIG_CPU_SUP_AMD +#ifdef CONFIG_K8_NB /* * L3 cache descriptors @@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -#else /* CONFIG_CPU_SUP_AMD */ +#else /* CONFIG_K8_NB */ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; -#endif /* CONFIG_CPU_SUP_AMD */ +#endif /* CONFIG_K8_NB */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, -#ifdef CONFIG_CPU_SUP_AMD +#ifdef CONFIG_K8_NB &cache_disable_0.attr, &cache_disable_1.attr, #endif -- cgit v1.2.3 From 900f9ac9f12dc3dd6fc8e33e16df172eafcaead6 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Sep 2010 18:02:54 +0200 Subject: x86, k8-gart: Decouple handling of garts and northbridges So far we only provide num_k8_northbridges. This is required in different areas (e.g. L3 cache index disable, GART). But not all AMD CPUs provide a GART. 
Thus it is useful to split off the GART handling from the generic caching of AMD northbridge misc devices. Signed-off-by: Andreas Herrmann LKML-Reference: <20100917160254.GC4958@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/k8.h | 13 +++++---- arch/x86/kernel/cpu/intel_cacheinfo.c | 4 +-- arch/x86/kernel/k8.c | 52 ++++++++++++++++++++--------------- arch/x86/kernel/pci-gart_64.c | 27 ++++++++++++------ 4 files changed, 58 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index af00bd1d2089..9cee145dcace 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[]; struct bootnode; extern int early_is_k8_nb(u32 value); -extern struct pci_dev **k8_northbridges; -extern int num_k8_northbridges; extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); extern int k8_get_nodes(struct bootnode *nodes); extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); extern int k8_scan_nodes(void); +struct k8_northbridge_info { + u16 num; + u8 gart_supported; + struct pci_dev **nb_misc; +}; +extern struct k8_northbridge_info k8_northbridges; + #ifdef CONFIG_K8_NB -extern int num_k8_northbridges; static inline struct pci_dev *node_to_k8_nb_misc(int node) { - return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; + return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; } #else -#define num_k8_northbridges 0 static inline struct pci_dev *node_to_k8_nb_misc(int node) { diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 2521cdcb877e..6fdfb0b20f8c 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -369,7 +369,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, return; /* not in virtualized environments */ - if (num_k8_northbridges == 0) + if (k8_northbridges.num == 0) return; /* @@ -377,7 +377,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); + int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index 0f7bc20cfcde..5de1b6b39639 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -10,9 +10,6 @@ #include #include -int num_k8_northbridges; -EXPORT_SYMBOL(num_k8_northbridges); - static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { @@ -22,7 +19,7 @@ struct pci_device_id k8_nb_ids[] = { }; EXPORT_SYMBOL(k8_nb_ids); -struct pci_dev **k8_northbridges; +struct k8_northbridge_info k8_northbridges; EXPORT_SYMBOL(k8_northbridges); static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) @@ -40,36 +37,44 @@ int cache_k8_northbridges(void) int i; struct pci_dev *dev; - if (num_k8_northbridges) + if (k8_northbridges.num) return 0; dev = NULL; while ((dev = next_k8_northbridge(dev)) != NULL) - num_k8_northbridges++; + k8_northbridges.num++; + + /* some CPU families (e.g. 
family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + k8_northbridges.gart_supported = 1; - k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), - GFP_KERNEL); - if (!k8_northbridges) + k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * + sizeof(void *), GFP_KERNEL); + if (!k8_northbridges.nb_misc) return -ENOMEM; - if (!num_k8_northbridges) { - k8_northbridges[0] = NULL; + if (!k8_northbridges.num) { + k8_northbridges.nb_misc[0] = NULL; return 0; } - flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges); - return -ENOMEM; + if (k8_northbridges.gart_supported) { + flush_words = kmalloc(k8_northbridges.num * sizeof(u32), + GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges.nb_misc); + return -ENOMEM; + } } dev = NULL; i = 0; while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges[i] = dev; - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); + k8_northbridges.nb_misc[i] = dev; + if (k8_northbridges.gart_supported) + pci_read_config_dword(dev, 0x9c, &flush_words[i++]); } - k8_northbridges[i] = NULL; + k8_northbridges.nb_misc[i] = NULL; return 0; } EXPORT_SYMBOL_GPL(cache_k8_northbridges); @@ -93,22 +98,25 @@ void k8_flush_garts(void) unsigned long flags; static DEFINE_SPINLOCK(gart_lock); + if (!k8_northbridges.gart_supported) + return; + /* Avoid races between AGP and IOMMU. In theory it's not needed but I'm not sure if the hardware won't lose flush requests when another is pending. This whole thing is so expensive anyways that it doesn't matter to serialize more. -AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < num_k8_northbridges; i++) { - pci_write_config_dword(k8_northbridges[i], 0x9c, + for (i = 0; i < k8_northbridges.num; i++) { + pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, flush_words[i]|1); flushed++; } - for (i = 0; i < num_k8_northbridges; i++) { + for (i = 0; i < k8_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges[i], + pci_read_config_dword(k8_northbridges.nb_misc[i], 0x9c, &w); if (!(w & 1)) break; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa67..8f214a2643fa 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -560,8 +560,11 @@ static void enable_gart_translations(void) { int i; - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + if (!k8_northbridges.gart_supported) + return; + + for (i = 0; i < k8_northbridges.num; i++) { + struct pci_dev *dev = k8_northbridges.nb_misc[i]; enable_gart_translation(dev, __pa(agp_gatt_table)); } @@ -592,10 +595,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; + if (!k8_northbridges.gart_supported) + return; + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < k8_northbridges.num; i++) { + struct pci_dev *dev = k8_northbridges.nb_misc[i]; /* * Don't enable translations just yet. 
That is the next @@ -649,8 +655,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < num_k8_northbridges; i++) { - dev = k8_northbridges[i]; + for (i = 0; i < k8_northbridges.num; i++) { + dev = k8_northbridges.nb_misc[i]; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -718,10 +724,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - for (i = 0; i < num_k8_northbridges; i++) { + if (!k8_northbridges.gart_supported) + return; + + for (i = 0; i < k8_northbridges.num; i++) { u32 ctl; - dev = k8_northbridges[i]; + dev = k8_northbridges.nb_misc[i]; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -739,7 +748,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (num_k8_northbridges == 0) + if (!k8_northbridges.gart_supported) return 0; #ifndef CONFIG_AGP_AMD64 -- cgit v1.2.3 From bc83cccc761953f878088cdfa682de0970b5561f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 17 Sep 2010 15:36:40 -0700 Subject: x86, mwait: Move mwait constants to a common header file We have MWAIT constants spread across three different .c files, for no good reason. Move them all into a common header file. Signed-off-by: H. Peter Anvin Reviewed-by: Arjan van de Ven Cc: Len Brown LKML-Reference: --- arch/x86/include/asm/mwait.h | 15 +++++++++++++++ arch/x86/kernel/acpi/cstate.c | 11 +---------- 2 files changed, 16 insertions(+), 10 deletions(-) create mode 100644 arch/x86/include/asm/mwait.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h new file mode 100644 index 000000000000..bcdff997668c --- /dev/null +++ b/arch/x86/include/asm/mwait.h @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_MWAIT_H +#define _ASM_X86_MWAIT_H + +#define MWAIT_SUBSTATE_MASK 0xf +#define MWAIT_CSTATE_MASK 0xf +#define MWAIT_SUBSTATE_SIZE 4 +#define MWAIT_MAX_NUM_CSTATES 8 + +#define CPUID_MWAIT_LEAF 5 +#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 +#define CPUID5_ECX_INTERRUPT_BREAK 0x2 + +#define MWAIT_ECX_INTERRUPT_BREAK 0x1 + +#endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index fb7a5f052e2b..bcc4adda7b9b 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -13,6 +13,7 @@ #include #include +#include /* * Initialize bm_flags based on the CPU cache properties @@ -65,16 +66,6 @@ static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; -#define MWAIT_SUBSTATE_MASK (0xf) -#define MWAIT_CSTATE_MASK (0xf) -#define MWAIT_SUBSTATE_SIZE (4) - -#define CPUID_MWAIT_LEAF (5) -#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1) -#define CPUID5_ECX_INTERRUPT_BREAK (0x2) - -#define MWAIT_ECX_INTERRUPT_BREAK (0x1) - #define NATIVE_CSTATE_BEYOND_HALT (2) static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) -- cgit v1.2.3 From ea53069231f9317062910d6e772cca4ce93de8c8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 17 Sep 2010 15:39:11 -0700 Subject: x86, hotplug: Use mwait to offline a processor, fix the legacy case The code in native_play_dead() has a number of problems: 1. We should use MWAIT when available, to put ourselves into a deeper sleep state. 2. We use the existence of CLFLUSH to determine if WBINVD is safe, but that is totally bogus -- WBINVD is 486+, whereas CLFLUSH is a much later addition. 3. 
We should do WBINVD inside the loop, just in case of something like setting an A bit on page tables. Pointed out by Arjan van de Ven. This code is based in part of a previous patch by Venki Pallipadi, but unlike that patch this one keeps all the detection code local instead of pre-caching a bunch of information. We're shutting down the CPU; there is absolutely no hurry. This patch moves all the code to C and deletes the global wbinvd_halt() which is broken anyway. Originally-by: Venkatesh Pallipadi Signed-off-by: H. Peter Anvin Reviewed-by: Arjan van de Ven Cc: Len Brown Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <20090522232230.162239000@intel.com> --- arch/x86/include/asm/processor.h | 23 --------------- arch/x86/kernel/smpboot.c | 63 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbebaa..f358241e4121 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -764,29 +764,6 @@ extern unsigned long idle_halt; extern unsigned long idle_nomwait; extern bool c1e_detected; -/* - * on systems with caches, caches must be flashed as the absolute - * last instruction before going into a suspended halt. Otherwise, - * dirty data can linger in the cache and become stale on resume, - * leading to strange errors. - * - * perform a variety of operations to guarantee that the compiler - * will not reorder instructions. wbinvd itself is serializing - * so the processor will not reorder. - * - * Systems without cache can just go into halt. - */ -static inline void wbinvd_halt(void) -{ - mb(); - /* check for clflush to determine if wbinvd is legal */ - if (cpu_has_clflush) - asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); - else - while (1) - halt(); -} - extern void enable_sep_cpu(void); extern int sysenter_setup(void); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8b3bfc4dd708..07bf4233441d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -1383,11 +1384,71 @@ void play_dead_common(void) local_irq_disable(); } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + + if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) + return; + if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return; + + eax = CPUID_MWAIT_LEAF; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. 
+ */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + while (1) { + mb(); + wbinvd(); + __monitor(¤t_thread_info()->flags, 0, 0); + mb(); + __mwait(eax, 0); + } +} + +static inline void hlt_play_dead(void) +{ + while (1) { + mb(); + if (current_cpu_data.x86 >= 4) + wbinvd(); + mb(); + native_halt(); + } +} + void native_play_dead(void) { play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - wbinvd_halt(); + + mwait_play_dead(); /* Only returns on failure */ + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From a68e5c94f7d3dd64fef34dd5d97e365cae4bb42a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 17 Sep 2010 17:06:46 -0700 Subject: x86, hotplug: Move WBINVD back outside the play_dead loop On processors with hyperthreading, when only one thread is offlined the other thread can cause a spurious wakeup on the idled thread. We do not want to re-WBINVD when that happens. Ideally, we should simply skip WBINVD unless we're the last thread on a particular core to shut down, but there might be similar issues elsewhere in the system. Thus, revert to previous behavior of only WBINVD outside the loop. Partly as a result, remove the mb()'s around it: they are not necessary since wbinvd() is a serializing instruction, but they were intended to make sure the compiler didn't do any funny loop optimizations. Reported-by: Asit Mallick Signed-off-by: H. Peter Anvin Cc: Arjan van de Ven Cc: Len Brown Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: --- arch/x86/kernel/smpboot.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 07bf4233441d..55c80ffb8719 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1422,9 +1422,9 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } + wbinvd(); + while (1) { - mb(); - wbinvd(); __monitor(¤t_thread_info()->flags, 0, 0); mb(); __mwait(eax, 0); @@ -1433,11 +1433,10 @@ static inline void mwait_play_dead(void) static inline void hlt_play_dead(void) { + if (current_cpu_data.x86 >= 4) + wbinvd(); + while (1) { - mb(); - if (current_cpu_data.x86 >= 4) - wbinvd(); - mb(); native_halt(); } } -- cgit v1.2.3 From 23ac4ae827e6264e21b898f2cd3f601450aa02a6 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Sep 2010 18:03:43 +0200 Subject: x86, k8: Rename k8.[ch] to amd_nb.[ch] and CONFIG_K8_NB to CONFIG_AMD_NB The file names are somehow misleading as the code is not specific to AMD K8 CPUs anymore. The files accomodate code for other AMD CPU northbridges as well. Same is true for the config option which is valid for AMD CPU northbridges in general and not specific to K8. Signed-off-by: Andreas Herrmann LKML-Reference: <20100917160343.GD4958@loge.amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 4 +- arch/x86/include/asm/amd_nb.h | 39 +++++++++ arch/x86/include/asm/k8.h | 39 --------- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/amd_nb.c | 145 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/aperture_64.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 10 +-- arch/x86/kernel/k8.c | 145 ---------------------------------- arch/x86/kernel/pci-gart_64.c | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/mm/k8topology_64.c | 2 +- arch/x86/mm/numa_64.c | 2 +- 12 files changed, 197 insertions(+), 197 deletions(-) create mode 100644 arch/x86/include/asm/amd_nb.h delete mode 100644 arch/x86/include/asm/k8.h create mode 100644 arch/x86/kernel/amd_nb.c delete mode 100644 arch/x86/kernel/k8.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..7fd41f0d7542 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -670,7 +670,7 @@ config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI && K8_NB + depends on X86_64 && PCI && AMD_NB ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. This is usually needed for USB, @@ -2076,7 +2076,7 @@ config OLPC_OPENFIRMWARE endif # X86_32 -config K8_NB +config AMD_NB def_bool y depends on CPU_SUP_AMD && PCI diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h new file mode 100644 index 000000000000..c8517f81b21e --- /dev/null +++ b/arch/x86/include/asm/amd_nb.h @@ -0,0 +1,39 @@ +#ifndef _ASM_X86_AMD_NB_H +#define _ASM_X86_AMD_NB_H + +#include + +extern struct pci_device_id k8_nb_ids[]; +struct bootnode; + +extern int early_is_k8_nb(u32 value); +extern int cache_k8_northbridges(void); +extern void k8_flush_garts(void); +extern int k8_get_nodes(struct bootnode *nodes); +extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int k8_scan_nodes(void); + +struct k8_northbridge_info { + u16 num; + u8 gart_supported; + struct pci_dev **nb_misc; +}; +extern struct k8_northbridge_info k8_northbridges; + +#ifdef CONFIG_AMD_NB + +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; +} + +#else + +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return NULL; +} +#endif + + +#endif /* _ASM_X86_AMD_NB_H */ diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h deleted file mode 100644 index 9cee145dcace..000000000000 --- a/arch/x86/include/asm/k8.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _ASM_X86_K8_H -#define _ASM_X86_K8_H - -#include - -extern struct pci_device_id k8_nb_ids[]; -struct bootnode; - -extern int early_is_k8_nb(u32 value); -extern int cache_k8_northbridges(void); -extern void k8_flush_garts(void); -extern int k8_get_nodes(struct bootnode *nodes); -extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); -extern int k8_scan_nodes(void); - -struct k8_northbridge_info { - u16 num; - u8 gart_supported; - struct pci_dev **nb_misc; -}; -extern struct k8_northbridge_info k8_northbridges; - -#ifdef CONFIG_K8_NB - -static inline struct pci_dev *node_to_k8_nb_misc(int node) -{ - return (node < k8_northbridges.num) ? 
k8_northbridges.nb_misc[node] : NULL; -} - -#else - -static inline struct pci_dev *node_to_k8_nb_misc(int node) -{ - return NULL; -} -#endif - - -#endif /* _ASM_X86_K8_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266bd..25dc82d3e76d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,7 +87,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o -obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c new file mode 100644 index 000000000000..4ffc38df7ac5 --- /dev/null +++ b/arch/x86/kernel/amd_nb.c @@ -0,0 +1,145 @@ +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. + */ +#include +#include +#include +#include +#include +#include +#include + +static u32 *flush_words; + +struct pci_device_id k8_nb_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + {} +}; +EXPORT_SYMBOL(k8_nb_ids); + +struct k8_northbridge_info k8_northbridges; +EXPORT_SYMBOL(k8_northbridges); + +static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_ids[0], dev)); + return dev; +} + +int cache_k8_northbridges(void) +{ + int i; + struct pci_dev *dev; + + if (k8_northbridges.num) + return 0; + + dev = NULL; + while ((dev = next_k8_northbridge(dev)) != NULL) + k8_northbridges.num++; + + /* some CPU families (e.g. family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + k8_northbridges.gart_supported = 1; + + k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * + sizeof(void *), GFP_KERNEL); + if (!k8_northbridges.nb_misc) + return -ENOMEM; + + if (!k8_northbridges.num) { + k8_northbridges.nb_misc[0] = NULL; + return 0; + } + + if (k8_northbridges.gart_supported) { + flush_words = kmalloc(k8_northbridges.num * sizeof(u32), + GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges.nb_misc); + return -ENOMEM; + } + } + + dev = NULL; + i = 0; + while ((dev = next_k8_northbridge(dev)) != NULL) { + k8_northbridges.nb_misc[i] = dev; + if (k8_northbridges.gart_supported) + pci_read_config_dword(dev, 0x9c, &flush_words[i++]); + } + k8_northbridges.nb_misc[i] = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(cache_k8_northbridges); + +/* Ignores subdevice/subvendor but as far as I can figure out + they're useless anyways */ +int __init early_is_k8_nb(u32 device) +{ + struct pci_device_id *id; + u32 vendor = device & 0xffff; + device >>= 16; + for (id = k8_nb_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return 1; + return 0; +} + +void k8_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + if (!k8_northbridges.gart_supported) + return; + + /* Avoid races between AGP and IOMMU. In theory it's not needed + but I'm not sure if the hardware won't lose flush requests + when another is pending. This whole thing is so expensive anyways + that it doesn't matter to serialize more. 
-AK */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < k8_northbridges.num; i++) { + pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, + flush_words[i]|1); + flushed++; + } + for (i = 0; i < k8_northbridges.num; i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(k8_northbridges.nb_misc[i], + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + printk("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(k8_flush_garts); + +static __init int init_k8_nbs(void) +{ + int err = 0; + + err = cache_k8_northbridges(); + + if (err < 0) + printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e17..e91e042a5c8b 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include int gart_iommu_aperture; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6fdfb0b20f8c..12cd823c8d03 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #define LVL_1_INST 1 @@ -306,7 +306,7 @@ struct _cache_attr { ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); }; -#ifdef CONFIG_K8_NB +#ifdef CONFIG_AMD_NB /* * L3 cache descriptors @@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -#else /* CONFIG_K8_NB */ +#else /* CONFIG_AMD_NB */ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; -#endif /* CONFIG_K8_NB */ +#endif /* CONFIG_AMD_NB */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, -#ifdef CONFIG_K8_NB +#ifdef CONFIG_AMD_NB &cache_disable_0.attr, &cache_disable_1.attr, #endif diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c deleted file mode 100644 index 5de1b6b39639..000000000000 --- a/arch/x86/kernel/k8.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Shared support code for AMD K8 northbridges and derivates. - * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. - */ -#include -#include -#include -#include -#include -#include -#include - -static u32 *flush_words; - -struct pci_device_id k8_nb_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, - {} -}; -EXPORT_SYMBOL(k8_nb_ids); - -struct k8_northbridge_info k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); - -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) -{ - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); - return dev; -} - -int cache_k8_northbridges(void) -{ - int i; - struct pci_dev *dev; - - if (k8_northbridges.num) - return 0; - - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - k8_northbridges.num++; - - /* some CPU families (e.g. 
family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) - k8_northbridges.gart_supported = 1; - - k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * - sizeof(void *), GFP_KERNEL); - if (!k8_northbridges.nb_misc) - return -ENOMEM; - - if (!k8_northbridges.num) { - k8_northbridges.nb_misc[0] = NULL; - return 0; - } - - if (k8_northbridges.gart_supported) { - flush_words = kmalloc(k8_northbridges.num * sizeof(u32), - GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges.nb_misc); - return -ENOMEM; - } - } - - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges.nb_misc[i] = dev; - if (k8_northbridges.gart_supported) - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges.nb_misc[i] = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(cache_k8_northbridges); - -/* Ignores subdevice/subvendor but as far as I can figure out - they're useless anyways */ -int __init early_is_k8_nb(u32 device) -{ - struct pci_device_id *id; - u32 vendor = device & 0xffff; - device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) - if (vendor == id->vendor && device == id->device) - return 1; - return 0; -} - -void k8_flush_garts(void) -{ - int flushed, i; - unsigned long flags; - static DEFINE_SPINLOCK(gart_lock); - - if (!k8_northbridges.gart_supported) - return; - - /* Avoid races between AGP and IOMMU. In theory it's not needed - but I'm not sure if the hardware won't lose flush requests - when another is pending. This whole thing is so expensive anyways - that it doesn't matter to serialize more. -AK */ - spin_lock_irqsave(&gart_lock, flags); - flushed = 0; - for (i = 0; i < k8_northbridges.num; i++) { - pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, - flush_words[i]|1); - flushed++; - } - for (i = 0; i < k8_northbridges.num; i++) { - u32 w; - /* Make sure the hardware actually executed the flush*/ - for (;;) { - pci_read_config_dword(k8_northbridges.nb_misc[i], - 0x9c, &w); - if (!(w & 1)) - break; - cpu_relax(); - } - } - spin_unlock_irqrestore(&gart_lock, flags); - if (!flushed) - printk("nothing to flush?\n"); -} -EXPORT_SYMBOL_GPL(k8_flush_garts); - -static __init int init_k8_nbs(void) -{ - int err = 0; - - err = cache_k8_northbridges(); - - if (err < 0) - printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); - - return err; -} - -/* This has to go after the PCI subsystem */ -fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 8f214a2643fa..67e5665ce42c 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b996..77eea0362a49 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -107,7 +107,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86_64 #include #endif diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 970ed579d4e4..ab75b181812f 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include static struct bootnode __initdata nodes[8]; static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a7bcc23ef96c..4962f1aeda6f 100644 --- a/arch/x86/mm/numa_64.c +++ 
b/arch/x86/mm/numa_64.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -- cgit v1.2.3 From ce5f68246bf2385d6174856708d0b746dc378f20 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 20 Sep 2010 13:04:45 -0700 Subject: x86, hotplug: In the MWAIT case of play_dead, CLFLUSH the cache line When we're using MWAIT for play_dead, explicitly CLFLUSH the cache line before executing MONITOR. This is a potential workaround for the Xeon 7400 erratum AAI65 after having a spurious wakeup and returning around the loop. "Potential" here because it is not certain that that erratum could actually trigger; however, the CLFLUSH should be harmless. Signed-off-by: H. Peter Anvin Acked-by: Venkatesh Pallipadi Cc: Asit Mallick Cc: Arjan van de Ven Cc: Len Brown --- arch/x86/kernel/smpboot.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 55c80ffb8719..fdccfe9dc63d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1394,9 +1394,12 @@ static inline void mwait_play_dead(void) unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; int i; + void *mwait_ptr; if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) return; + if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH)) + return; if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) return; @@ -1422,10 +1425,25 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } + /* + * This should be a memory location in a cache line which is + * unlikely to be touched by other processors. The actual + * content is immaterial as it is not actually modified in any way. + */ + mwait_ptr = ¤t_thread_info()->flags; + wbinvd(); while (1) { - __monitor(¤t_thread_info()->flags, 0, 0); + /* + * The CLFLUSH is a workaround for erratum AAI65 for + * the Xeon 7400 series. It's not clear it is actually + * needed, but it should be harmless in either case. + * The WBINVD is insufficient due to the spurious-wakeup + * case where we return around the loop. + */ + clflush(mwait_ptr); + __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); } -- cgit v1.2.3 From c2b9ff24a0df649d4d40947878b5b5ac39c7299e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 20 Sep 2010 18:01:46 -0700 Subject: x86, cpu: Re-run get_cpu_cap() after adjusting the CPUID level At least on Intel, adjusting the max CPUID level can expose new CPUID features, so we need to re-run get_cpu_cap() after changing the CPUID level. Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/cpu.h | 1 + arch/x86/kernel/cpu/intel.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490dac63c2d2..f2f9ac7da25c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -545,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } -static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; u32 ebx; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 3624e8a0f71b..c16456bc11a7 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -32,6 +32,7 @@ struct cpu_dev { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 85f69cdeae10..b4389441efbb 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); c->cpuid_level = cpuid_eax(0); + get_cpu_cap(c); } } -- cgit v1.2.3 From 234bb549eea16ec7d5207ba747fb8dbf489e64c1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:46:34 +0100 Subject: x86, cleanups: Use clear_page/copy_page rather than memset/memcpy When operating on whole pages, use clear_page() and copy_page() in favor of memset() and memcpy(); after all that's what they are intended for. Signed-off-by: Jan Beulich LKML-Reference: <4C7FB8CA0200007800013F51@vpn.id2.novell.com> Cc: Andrew Morton Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 4 ++-- arch/x86/kvm/lapic.c | 3 +-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 035c8c529181..b3ea9db39db6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pud = (pud_t *)page_address(page); - memset(pud, 0, PAGE_SIZE); + clear_page(pud); set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); } pud = pud_offset(pgd, addr); @@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pmd = (pmd_t *)page_address(page); - memset(pmd, 0, PAGE_SIZE); + clear_page(pmd); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } pmd = pmd_offset(pud, addr); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 77d8c0f4817d..22b06f7660f4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1056,14 +1056,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL); + apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (apic->regs_page == NULL) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } apic->regs = page_address(apic->regs_page); - memset(apic->regs, 0, PAGE_SIZE); apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bca79091b9d6..558f2d332076 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,7 +67,7 @@ static __init void *alloc_low_page(void) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); return adr; } @@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE] static inline void save_pg_dir(void) { - memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); + copy_page(swsusp_pg_dir, swapper_pg_dir); } #else /* !CONFIG_ACPI_SLEEP */ static inline void save_pg_dir(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a20..7c48ad4faca3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -293,7 +293,7 @@ static __ref void *alloc_low_page(unsigned long *phys) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); *phys = pfn * PAGE_SIZE; return adr; } -- cgit v1.2.3 From 76fb657017588a0912f0d1d140cb807446e4ef05 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 23 Sep 2010 17:28:04 +0100 Subject: x86, olpc: Only enable PCI configuration type override on XO-1 This configuration type override is for XO-1 only and must not happen on XO-1.5. Signed-off-by: Daniel Drake LKML-Reference: <20100923162805.0F6549D401B@zog.reactivated.net> Cc: Andres Solomon Cc: Grant Likely Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/kernel/olpc.c | 6 ++++-- arch/x86/pci/olpc.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..0ed4c9bfcd13 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1900,7 +1900,7 @@ config PCI_GODIRECT bool "Direct" config PCI_GOOLPC - bool "OLPC" + bool "OLPC XO-1" depends on OLPC config PCI_GOANY diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 0e0cdde519be..635888cf050d 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -242,8 +242,10 @@ static int __init olpc_init(void) (unsigned char *) &olpc_platform_info.ecver, 1); #ifdef CONFIG_PCI_OLPC - /* If the VSA exists let it emulate PCI, if not emulate in kernel */ - if (!cs5535_has_vsa2()) + /* If the VSA exists let it emulate PCI, if not emulate in kernel. + * XO-1 only. */ + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && + !cs5535_has_vsa2()) x86_init.pci.arch_init = pci_olpc_init; #endif diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b34815408f58..13700ec8e2e4 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = { int __init pci_olpc_init(void) { - printk(KERN_INFO "PCI: Using configuration type OLPC\n"); + printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); return 0; -- cgit v1.2.3 From 3e3c486012a3dbb113c0ca15ee265d309d77aea9 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 23 Sep 2010 17:28:46 +0100 Subject: x86, olpc: Rework BIOS signature check The XO-1.5 laptop is not currently detected as an OLPC machine because it fails this XO-1-centric check. Now that we have OLPC OFW support in the kernel, a more sensible check is to see if we found OFW during boot and check the architecture property. Also remove a now-meaningless codepath, as we're always going to have OFW support with OLPC. Signed-off-by: Daniel Drake LKML-Reference: <20100923162846.D8D409D401B@zog.reactivated.net> Cc: Andres Salomon Cc: Grant Likely Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 3 ++- arch/x86/include/asm/olpc_ofw.h | 4 +++ arch/x86/kernel/olpc.c | 58 ++++++++++++++++++----------------------- arch/x86/kernel/olpc_ofw.c | 6 +++++ 4 files changed, 37 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0ed4c9bfcd13..c25525571347 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2061,6 +2061,7 @@ config SCx200HR_TIMER config OLPC bool "One Laptop Per Child support" select GPIOLIB + select OLPC_OPENFIRMWARE ---help--- Add support for detecting the unique features of the OLPC XO hardware. @@ -2068,7 +2069,7 @@ config OLPC config OLPC_OPENFIRMWARE bool "Support for OLPC's Open Firmware" depends on !X86_64 && !X86_PAE - default y if OLPC + default n help This option adds support for the implementation of Open Firmware that is used on the OLPC XO-1 Children's Machine. 
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 08fde475cb3b..2a8478140bb3 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void); /* install OFW's pde permanently into the kernel's pgtable */ extern void setup_olpc_ofw_pgd(void); +/* check if OFW was detected during boot */ +extern bool olpc_ofw_present(void); + #else /* !CONFIG_OLPC_OPENFIRMWARE */ static inline void olpc_ofw_detect(void) { } static inline void setup_olpc_ofw_pgd(void) { } +static inline bool olpc_ofw_present(void) { return false; } #endif /* !CONFIG_OLPC_OPENFIRMWARE */ diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 635888cf050d..37c49934c785 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -183,8 +183,21 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OLPC_OPENFIRMWARE -static void __init platform_detect(void) +static bool __init check_ofw_architecture(void) +{ + size_t propsize; + char olpc_arch[5]; + const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; + void *res[] = { &propsize }; + + if (olpc_ofw("getprop", args, res)) { + printk(KERN_ERR "ofw: getprop call failed!\n"); + return false; + } + return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; +} + +static u32 __init get_board_revision(void) { size_t propsize; __be32 rev; @@ -193,46 +206,27 @@ static void __init platform_detect(void) if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); - rev = cpu_to_be32(0); + return cpu_to_be32(0); } - olpc_platform_info.boardrev = be32_to_cpu(rev); + return be32_to_cpu(rev); } -#else -static void __init platform_detect(void) + +static bool __init platform_detect(void) { - /* stopgap until OFW support is added to the kernel */ - olpc_platform_info.boardrev = olpc_board(0xc2); + if (!check_ofw_architecture()) + return false; + olpc_platform_info.flags |= OLPC_F_PRESENT; + olpc_platform_info.boardrev = get_board_revision(); + return true; } -#endif static int __init olpc_init(void) { - unsigned char *romsig; - - /* The ioremap check is dangerous; limit what we run it on */ - if (!is_geode() || cs5535_has_vsa2()) + if (!olpc_ofw_present() || !platform_detect()) return 0; spin_lock_init(&ec_lock); - romsig = ioremap(0xffffffc0, 16); - if (!romsig) - return 0; - - if (strncmp(romsig, "CL1 Q", 7)) - goto unmap; - if (strncmp(romsig+6, romsig+13, 3)) { - printk(KERN_INFO "OLPC BIOS signature looks invalid. 
" - "Assuming not OLPC\n"); - goto unmap; - } - - printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig); - olpc_platform_info.flags |= OLPC_F_PRESENT; - - /* get the platform revision */ - platform_detect(); - /* assume B1 and above models always have a DCON */ if (olpc_board_at_least(olpc_board(0xb1))) olpc_platform_info.flags |= OLPC_F_DCON; @@ -254,8 +248,6 @@ static int __init olpc_init(void) olpc_platform_info.boardrev >> 4, olpc_platform_info.ecver); -unmap: - iounmap(romsig); return 0; } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index 3218aa71ab5e..787320464379 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, } EXPORT_SYMBOL_GPL(__olpc_ofw); +bool olpc_ofw_present(void) +{ + return olpc_ofw_cif != NULL; +} +EXPORT_SYMBOL_GPL(olpc_ofw_present); + /* OFW cif _should_ be above this address */ #define OFW_MIN 0xff000000 -- cgit v1.2.3 From 3b4b682becdfa9f42321aa024d5cc84f71f06d8c Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Fri, 17 Sep 2010 03:12:40 +0800 Subject: x86, mem: Optimize memmove for small size and unaligned cases movs instruction will combine data to accelerate moving data, however we need to concern two cases about it. 1. movs instruction need long lantency to startup, so here we use general mov instruction to copy data. 2. movs instruction is not good for unaligned case, even if src offset is 0x10, dest offset is 0x0, we avoid and handle the case by general mov instruction. Signed-off-by: Ma Ling LKML-Reference: <1284664360-6138-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_32.c | 213 ++++++++++++++++++++++++++++++++++++------- arch/x86/lib/memmove_64.c | 225 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 362 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 81130d477ee2..b908a59eccf5 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -22,36 +22,187 @@ EXPORT_SYMBOL(memset); void *memmove(void *dest, const void *src, size_t n) { - int d0, d1, d2; - - if (dest < src) { - if ((dest + n) < src) - return memcpy(dest, src, n); - else - __asm__ __volatile__( - "rep\n\t" - "movsb\n\t" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (src), - "2" (dest) - :"memory"); - } else { - if((src + n) < dest) - return memcpy(dest, src, n); - else - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+src), - "2" (n-1+dest) - :"memory"); - } - - return dest; + int d0,d1,d2,d3,d4,d5; + char *ret = dest; + + __asm__ __volatile__( + /* Handle more 16bytes in loop */ + "cmp $0x10, %0\n\t" + "jb 1f\n\t" + + /* Decide forward/backward copy mode */ + "cmp %2, %1\n\t" + "jb 2f\n\t" + + /* + * movs instruction have many startup latency + * so we handle small size by general register. + */ + "cmp $680, %0\n\t" + "jb 3f\n\t" + /* + * movs instruction is only good for aligned case. + */ + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 4f\n\t" + "3:\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16byts forward in each loop. 
+ */ + "3:\n\t" + "sub $0x10, %0\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov 2*4(%1), %3\n\t" + "mov 3*4(%1), %4\n\t" + "mov %3, 2*4(%2)\n\t" + "mov %4, 3*4(%2)\n\t" + "lea 0x10(%1), %1\n\t" + "lea 0x10(%2), %2\n\t" + "jae 3b\n\t" + "add $0x10, %0\n\t" + "jmp 1f\n\t" + + /* + * Handle data forward by movs. + */ + ".p2align 4\n\t" + "4:\n\t" + "mov -4(%1, %0), %3\n\t" + "lea -4(%2, %0), %4\n\t" + "shr $2, %0\n\t" + "rep movsl\n\t" + "mov %3, (%4)\n\t" + "jmp 11f\n\t" + /* + * Handle data backward by movs. + */ + ".p2align 4\n\t" + "6:\n\t" + "mov (%1), %3\n\t" + "mov %2, %4\n\t" + "lea -4(%1, %0), %1\n\t" + "lea -4(%2, %0), %2\n\t" + "shr $2, %0\n\t" + "std\n\t" + "rep movsl\n\t" + "mov %3,(%4)\n\t" + "cld\n\t" + "jmp 11f\n\t" + + /* + * Start to prepare for backward copy. + */ + ".p2align 4\n\t" + "2:\n\t" + "cmp $680, %0\n\t" + "jb 5f\n\t" + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 6b\n\t" + + /* + * Calculate copy position to tail. + */ + "5:\n\t" + "add %0, %1\n\t" + "add %0, %2\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16byts backward in each loop. + */ + "7:\n\t" + "sub $0x10, %0\n\t" + + "mov -1*4(%1), %3\n\t" + "mov -2*4(%1), %4\n\t" + "mov %3, -1*4(%2)\n\t" + "mov %4, -2*4(%2)\n\t" + "mov -3*4(%1), %3\n\t" + "mov -4*4(%1), %4\n\t" + "mov %3, -3*4(%2)\n\t" + "mov %4, -4*4(%2)\n\t" + "lea -0x10(%1), %1\n\t" + "lea -0x10(%2), %2\n\t" + "jae 7b\n\t" + /* + * Calculate copy position to head. + */ + "add $0x10, %0\n\t" + "sub %0, %1\n\t" + "sub %0, %2\n\t" + + /* + * Move data from 8 bytes to 15 bytes. + */ + ".p2align 4\n\t" + "1:\n\t" + "cmp $8, %0\n\t" + "jb 8f\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov -2*4(%1, %0), %5\n\t" + "mov -1*4(%1, %0), %1\n\t" + + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov %5, -2*4(%2, %0)\n\t" + "mov %1, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 4 bytes to 7 bytes. + */ + ".p2align 4\n\t" + "8:\n\t" + "cmp $4, %0\n\t" + "jb 9f\n\t" + "mov 0*4(%1), %3\n\t" + "mov -1*4(%1, %0), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 2 bytes to 3 bytes. + */ + ".p2align 4\n\t" + "9:\n\t" + "cmp $2, %0\n\t" + "jb 10f\n\t" + "movw 0*2(%1), %%dx\n\t" + "movw -1*2(%1, %0), %%bx\n\t" + "movw %%dx, 0*2(%2)\n\t" + "movw %%bx, -1*2(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data for 1 byte. 
+ */ + ".p2align 4\n\t" + "10:\n\t" + "cmp $1, %0\n\t" + "jb 11f\n\t" + "movb (%1), %%cl\n\t" + "movb %%cl, (%2)\n\t" + ".p2align 4\n\t" + "11:" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), + "=r" (d3),"=r" (d4), "=r"(d5) + :"0" (n), + "1" (src), + "2" (dest) + :"memory"); + + return ret; + } EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c index ecacc4b3d9e5..6d0f0ec41b34 100644 --- a/arch/x86/lib/memmove_64.c +++ b/arch/x86/lib/memmove_64.c @@ -8,50 +8,185 @@ #undef memmove void *memmove(void *dest, const void *src, size_t count) { - unsigned long d0, d1, d2, d3; - if (dest < src) { - if ((dest + count) < src) - return memcpy(dest, src, count); - else - __asm__ __volatile__( - "movq %0, %3\n\t" - "shr $3, %0\n\t" - "andq $7, %3\n\t" - "rep\n\t" - "movsq\n\t" - "movq %3, %0\n\t" - "rep\n\t" - "movsb" - : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3) - :"0" (count), - "1" (src), - "2" (dest) - :"memory"); - } else { - if((src + count) < dest) - return memcpy(dest, src, count); - else - __asm__ __volatile__( - "movq %0, %3\n\t" - "lea -8(%1, %0), %1\n\t" - "lea -8(%2, %0), %2\n\t" - "shr $3, %0\n\t" - "andq $7, %3\n\t" - "std\n\t" - "rep\n\t" - "movsq\n\t" - "lea 7(%1), %1\n\t" - "lea 7(%2), %2\n\t" - "movq %3, %0\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3) - :"0" (count), - "1" (src), - "2" (dest) - :"memory"); - } - return dest; + unsigned long d0,d1,d2,d3,d4,d5,d6,d7; + char *ret; + + __asm__ __volatile__( + /* Handle more 32bytes in loop */ + "mov %2, %3\n\t" + "cmp $0x20, %0\n\t" + "jb 1f\n\t" + + /* Decide forward/backward copy mode */ + "cmp %2, %1\n\t" + "jb 2f\n\t" + + /* + * movsq instruction have many startup latency + * so we handle small size by general register. + */ + "cmp $680, %0\n\t" + "jb 3f\n\t" + /* + * movsq instruction is only good for aligned case. + */ + "cmpb %%dil, %%sil\n\t" + "je 4f\n\t" + "3:\n\t" + "sub $0x20, %0\n\t" + /* + * We gobble 32byts forward in each loop. + */ + "5:\n\t" + "sub $0x20, %0\n\t" + "movq 0*8(%1), %4\n\t" + "movq 1*8(%1), %5\n\t" + "movq 2*8(%1), %6\n\t" + "movq 3*8(%1), %7\n\t" + "leaq 4*8(%1), %1\n\t" + + "movq %4, 0*8(%2)\n\t" + "movq %5, 1*8(%2)\n\t" + "movq %6, 2*8(%2)\n\t" + "movq %7, 3*8(%2)\n\t" + "leaq 4*8(%2), %2\n\t" + "jae 5b\n\t" + "addq $0x20, %0\n\t" + "jmp 1f\n\t" + /* + * Handle data forward by movsq. + */ + ".p2align 4\n\t" + "4:\n\t" + "movq %0, %8\n\t" + "movq -8(%1, %0), %4\n\t" + "lea -8(%2, %0), %5\n\t" + "shrq $3, %8\n\t" + "rep movsq\n\t" + "movq %4, (%5)\n\t" + "jmp 13f\n\t" + /* + * Handle data backward by movsq. + */ + ".p2align 4\n\t" + "7:\n\t" + "movq %0, %8\n\t" + "movq (%1), %4\n\t" + "movq %2, %5\n\t" + "leaq -8(%1, %0), %1\n\t" + "leaq -8(%2, %0), %2\n\t" + "shrq $3, %8\n\t" + "std\n\t" + "rep movsq\n\t" + "cld\n\t" + "movq %4, (%5)\n\t" + "jmp 13f\n\t" + + /* + * Start to prepare for backward copy. + */ + ".p2align 4\n\t" + "2:\n\t" + "cmp $680, %0\n\t" + "jb 6f \n\t" + "cmp %%dil, %%sil\n\t" + "je 7b \n\t" + "6:\n\t" + /* + * Calculate copy position to tail. + */ + "addq %0, %1\n\t" + "addq %0, %2\n\t" + "subq $0x20, %0\n\t" + /* + * We gobble 32byts backward in each loop. 
+ */ + "8:\n\t" + "subq $0x20, %0\n\t" + "movq -1*8(%1), %4\n\t" + "movq -2*8(%1), %5\n\t" + "movq -3*8(%1), %6\n\t" + "movq -4*8(%1), %7\n\t" + "leaq -4*8(%1), %1\n\t" + + "movq %4, -1*8(%2)\n\t" + "movq %5, -2*8(%2)\n\t" + "movq %6, -3*8(%2)\n\t" + "movq %7, -4*8(%2)\n\t" + "leaq -4*8(%2), %2\n\t" + "jae 8b\n\t" + /* + * Calculate copy position to head. + */ + "addq $0x20, %0\n\t" + "subq %0, %1\n\t" + "subq %0, %2\n\t" + "1:\n\t" + "cmpq $16, %0\n\t" + "jb 9f\n\t" + /* + * Move data from 16 bytes to 31 bytes. + */ + "movq 0*8(%1), %4\n\t" + "movq 1*8(%1), %5\n\t" + "movq -2*8(%1, %0), %6\n\t" + "movq -1*8(%1, %0), %7\n\t" + "movq %4, 0*8(%2)\n\t" + "movq %5, 1*8(%2)\n\t" + "movq %6, -2*8(%2, %0)\n\t" + "movq %7, -1*8(%2, %0)\n\t" + "jmp 13f\n\t" + ".p2align 4\n\t" + "9:\n\t" + "cmpq $8, %0\n\t" + "jb 10f\n\t" + /* + * Move data from 8 bytes to 15 bytes. + */ + "movq 0*8(%1), %4\n\t" + "movq -1*8(%1, %0), %5\n\t" + "movq %4, 0*8(%2)\n\t" + "movq %5, -1*8(%2, %0)\n\t" + "jmp 13f\n\t" + "10:\n\t" + "cmpq $4, %0\n\t" + "jb 11f\n\t" + /* + * Move data from 4 bytes to 7 bytes. + */ + "movl (%1), %4d\n\t" + "movl -4(%1, %0), %5d\n\t" + "movl %4d, (%2)\n\t" + "movl %5d, -4(%2, %0)\n\t" + "jmp 13f\n\t" + "11:\n\t" + "cmp $2, %0\n\t" + "jb 12f\n\t" + /* + * Move data from 2 bytes to 3 bytes. + */ + "movw (%1), %4w\n\t" + "movw -2(%1, %0), %5w\n\t" + "movw %4w, (%2)\n\t" + "movw %5w, -2(%2, %0)\n\t" + "jmp 13f\n\t" + "12:\n\t" + "cmp $1, %0\n\t" + "jb 13f\n\t" + /* + * Move data for 1 byte. + */ + "movb (%1), %4b\n\t" + "movb %4b, (%2)\n\t" + "13:\n\t" + : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) , + "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7) + :"0" (count), + "1" (src), + "2" (dest) + :"memory"); + + return ret; + } EXPORT_SYMBOL(memmove); -- cgit v1.2.3 From 3fdbf004c1706480a7c7fac3c9d836fa6df20d7d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:32:35 +0200 Subject: x86, mtrr: Assume SYS_CFG[Tom2ForceMemTypeWB] exists on all future AMD CPUs Instead of adapting the CPU family check in amd_special_default_mtrr() for each new CPU family assume that all new AMD CPUs support the necessary bits in SYS_CFG MSR. Tom2Enabled is architectural (defined in APM Vol.2). Tom2ForceMemTypeWB is defined in all BKDGs starting with K8 NPT. In pre K8-NPT BKDG this bit is reserved (read as zero). W/o this adaption Linux would unnecessarily complain about bad MTRR settings on every new AMD CPU family, e.g. [ 0.000000] WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing 4863MB of RAM. Cc: stable@kernel.org # .32.x, .35.x Signed-off-by: Andreas Herrmann LKML-Reference: <20100930123235.GB20545@loge.amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index c5f59d071425..ac140c7be396 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) -- cgit v1.2.3 From 420b13b60a3e5c5dcc6ec290e131cf5fbc603d94 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:33:58 +0200 Subject: x86, nmi: Support NMI watchdog on newer AMD CPU families CPU families 0x12, 0x14 and 0x15 support this functionality. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930123357.GC20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/perfctr-watchdog.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index fb329e9f8494..d9f4ff8fcd69 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) - return; - wd_ops = &k7_wd_ops; - break; + if (boot_cpu_data.x86 == 6 || + (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) + wd_ops = &k7_wd_ops; + return; case X86_VENDOR_INTEL: /* Work around where perfctr1 doesn't have a working enable * bit as described in the following errata: -- cgit v1.2.3 From 23588c38a84c9175c6668789b64ffba4651e5c6a Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:36:28 +0200 Subject: x86, amd: Add support for CPUID topology extension of AMD CPUs Node information (ID, number of internal nodes) is provided via CPUID Fn8000_001e_ECX. See AMD CPUID Specification (Publication # 25481, Revision 2.34, September 2010). Signed-off-by: Andreas Herrmann LKML-Reference: <20100930123628.GD20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 50 ++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0f0ace5d7db5..7e6a37d24253 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -253,37 +253,41 @@ static int __cpuinit nearby_node(int apicid) #endif /* - * Fixup core topology information for AMD multi-node processors. - * Assumption: Number of cores in each internal node is the same. + * Fixup core topology information for + * (1) AMD multi-node processors + * Assumption: Number of cores in each internal node is the same. 
*/ #ifdef CONFIG_X86_HT -static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) +static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - unsigned long long value; u32 nodes, cores_per_node; + u8 node_id; + unsigned long long value; int cpu = smp_processor_id(); - if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) - return; - - /* fixup topology information only once for a core */ - if (cpu_has(c, X86_FEATURE_AMD_DCM)) + /* get information required for multi-node processors */ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + value = cpuid_ecx(0x8000001e); + nodes = ((value >> 8) & 7) + 1; + node_id = value & 7; + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes = ((value >> 3) & 7) + 1; + node_id = value & 7; + } else return; - rdmsrl(MSR_FAM10H_NODE_ID, value); + /* fixup multi-node processor information */ + if (nodes > 1) { + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; - nodes = ((value >> 3) & 7) + 1; - if (nodes == 1) - return; - - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cores_per_node = c->x86_max_cores / nodes; + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = node_id; - /* store NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = value & 7; - - /* fixup core id to be in range from 0 to (cores_per_node - 1) */ - c->cpu_core_id = c->cpu_core_id % cores_per_node; + /* core id to be in range from 0 to (cores_per_node - 1) */ + c->cpu_core_id = c->cpu_core_id % cores_per_node; + } } #endif @@ -304,9 +308,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; - /* fixup topology information on multi-node processors */ - if ((c->x86 == 0x10) && (c->x86_model == 9)) - amd_fixup_dcm(c); + amd_get_topology(c); #endif } -- cgit v1.2.3 From 6057b4d331f19a3ea51aec463ea7839c128b3227 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:38:57 +0200 Subject: x86, amd: Extract compute unit information for AMD CPUs Get compute unit information from CPUID Fn8000_001E_EBX. (See AMD CPUID Specification - publication # 25481, revision 2.34, September 2010.) Note that each core on a compute unit still has a core_id of its own. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930123857.GE20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/cpu/amd.c | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbebaa..69e80c2ec6c2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -110,6 +110,8 @@ struct cpuinfo_x86 { u16 phys_proc_id; /* Core id: */ u16 cpu_core_id; + /* Compute unit id */ + u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; #endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e6a37d24253..70168ab88b7f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -256,21 +256,29 @@ static int __cpuinit nearby_node(int apicid) * Fixup core topology information for * (1) AMD multi-node processors * Assumption: Number of cores in each internal node is the same. 
+ * (2) AMD processors supporting compute units */ #ifdef CONFIG_X86_HT static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - u32 nodes, cores_per_node; + u32 nodes; u8 node_id; - unsigned long long value; int cpu = smp_processor_id(); /* get information required for multi-node processors */ if (cpu_has(c, X86_FEATURE_TOPOEXT)) { - value = cpuid_ecx(0x8000001e); - nodes = ((value >> 8) & 7) + 1; - node_id = value & 7; + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + nodes = ((ecx >> 8) & 7) + 1; + node_id = ecx & 7; + + /* get compute unit information */ + smp_num_siblings = ((ebx >> 8) & 3) + 1; + c->compute_unit_id = ebx & 0xff; } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + rdmsrl(MSR_FAM10H_NODE_ID, value); nodes = ((value >> 3) & 7) + 1; node_id = value & 7; @@ -279,6 +287,8 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* fixup multi-node processor information */ if (nodes > 1) { + u32 cores_per_node; + set_cpu_cap(c, X86_FEATURE_AMD_DCM); cores_per_node = c->x86_max_cores / nodes; -- cgit v1.2.3 From d4fbe4f03557e1fd4d9bbb3a1957aad560f39e96 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:41:56 +0200 Subject: x86, amd: Use compute unit information to determine thread siblings This information is vital for different load balancing policies. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930124156.GF20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8b3bfc4dd708..bc2cc444844a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -397,6 +397,19 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } +static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +{ + struct cpuinfo_x86 *c1 = &cpu_data(cpu1); + struct cpuinfo_x86 *c2 = &cpu_data(cpu2); + + cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); + cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); + cpumask_set_cpu(cpu1, c2->llc_shared_map); + cpumask_set_cpu(cpu2, c1->llc_shared_map); +} + void __cpuinit set_cpu_sibling_map(int cpu) { @@ -409,14 +422,13 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { struct cpuinfo_x86 *o = &cpu_data(i); - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - cpumask_set_cpu(i, cpu_sibling_mask(cpu)); - cpumask_set_cpu(cpu, cpu_sibling_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, o->llc_shared_map); + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (c->phys_proc_id == o->phys_proc_id && + c->compute_unit_id == o->compute_unit_id) + link_thread_siblings(cpu, i); + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + link_thread_siblings(cpu, i); } } } else { -- cgit v1.2.3 From 5c80cc78de46aef6cd5e714208da05c3f7f548f8 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:43:16 +0200 Subject: x86, amd_nb: Enable GART support for AMD family 0x15 CPUs AMD CPU family 0x15 still supports GART for compatibility reasons. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930124316.GG20545@loge.amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/amd_nb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4ffc38df7ac5..8f6463d8ed0d 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,6 +15,7 @@ static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; EXPORT_SYMBOL(k8_nb_ids); @@ -45,7 +46,8 @@ int cache_k8_northbridges(void) k8_northbridges.num++; /* some CPU families (e.g. family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) k8_northbridges.gart_supported = 1; k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * -- cgit v1.2.3 From 3bb9808e99bcc36eecb8e082bf70efb2a0bcdcb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:46:02 +0000 Subject: x86: Use genirq Kconfig Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121843.314600915@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- arch/x86/Kconfig | 36 +++++------------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..3ec657f7ee70 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -59,6 +59,11 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select HAVE_GENERIC_HARDIRQS + select HAVE_SPARSE_IRQ + select NUMA_IRQ_DESC if (SPARSE_IRQ && NUMA) + select GENERIC_IRQ_PROBE + select GENERIC_PENDING_IRQ if SMP config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -200,20 +205,6 @@ config HAVE_INTEL_TXT def_bool y depends on EXPERIMENTAL && DMAR && ACPI -# Use the generic interrupt handling code in kernel/irq/: -config GENERIC_HARDIRQS - def_bool y - -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - -config GENERIC_IRQ_PROBE - def_bool y - -config GENERIC_PENDING_IRQ - def_bool y - depends on GENERIC_HARDIRQS && SMP - config USE_GENERIC_SMP_HELPERS def_bool y depends on SMP @@ -296,23 +287,6 @@ config X86_X2APIC If you don't know what to do here, say N. -config SPARSE_IRQ - bool "Support sparse irq numbering" - depends on PCI_MSI || HT_IRQ - ---help--- - This enables support for sparse irqs. This is useful for distro - kernels that want to define a high CONFIG_NR_CPUS value but still - want to have low kernel memory footprint on smaller machines. - - ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread - out the irq_desc[] array in a more NUMA-friendly way. ) - - If you don't know what to do here, say N. - -config NUMA_IRQ_DESC - def_bool y - depends on SPARSE_IRQ && NUMA - config X86_MPPARSE bool "Enable MPS table" if ACPI default y -- cgit v1.2.3 From 366d4a43b1a7a861c356a0e407c4f03f454d42ea Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Oct 2010 09:31:27 +0200 Subject: x86, cpu: Fix X86_FEATURE_NOPL ba0593bf553c450a03dbc5f8c1f0ff58b778a0c8 cleared the aforementioned cpuid bit only on 32-bit due to various problems with Virtual PC. This somehow got lost during the 32- + 64-bit merge so restore the feature bit on 64-bit. For that, set it explicitly for non-constant arguments of cpu_has(). 
Update comment for future reference. Signed-off-by: Borislav Petkov LKML-Reference: <20101004073127.GA20305@liondog.tnic> Cc: Ryan O'Neill Cc: Linus Torvalds Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f2f9ac7da25c..927e630aeaf3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -704,16 +704,21 @@ void __init early_cpu_init(void) } /* - * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6; unfortunately, that's not true in practice because - * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. In the latter case it doesn't even *fail* - * reliably, so probing for it doesn't even work. Disable it completely + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). */ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) -- cgit v1.2.3 From 161b0275e2311b8bd9609d5f32e2b703cf5d70a8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 19:35:22 -0700 Subject: x86, mm: Add RESERVE_BRK_ARRAY() helper This is useful when converting static arrays into boot-time brk allocated objects. Signed-off-by: Jeremy Fitzhardinge LKML-Reference: <4C805EEA.1080205@goop.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/setup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ef292c792d74..d6763b139a84 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align); : : "i" (sz)); \ } +/* Helper for reserving space for arrays of things */ +#define RESERVE_BRK_ARRAY(type, name, entries) \ + type *name; \ + RESERVE_BRK(name, sizeof(type) * entries) + #ifdef __i386__ void __init i386_start_kernel(void); -- cgit v1.2.3 From a416e9e1dde0fbcf20cda59df284cc0dcf2aadc4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 29 Sep 2010 23:29:48 +0900 Subject: x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation On 32-bit non-PAE system, cast to 'phys_addr_t' truncates value before subtraction. 
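(As a worked example for the non-PAE case: __PHYSICAL_MASK_SHIFT is 32 and phys_addr_t is 32 bits wide, so the old form evaluates (phys_addr_t)(1ULL << 32) - 1 = 0 - 1 = 0xffffffff by unsigned wraparound, while the new form evaluates (phys_addr_t)((1ULL << 32) - 1) = 0xffffffff directly, with no bits lost in the cast.)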
Subtracting before cast produce same result but remove following warnings from sparse: arch/x86/include/asm/pgtable_types.h:255:38: warning: cast truncates bits from constant value (100000000 becomes 0) arch/x86/include/asm/pgtable_types.h:270:38: warning: cast truncates bits from constant value (100000000 becomes 0) arch/x86/include/asm/pgtable.h:127:32: warning: cast truncates bits from constant value (100000000 becomes 0) arch/x86/include/asm/pgtable.h:132:32: warning: cast truncates bits from constant value (100000000 becomes 0) arch/x86/include/asm/pgtable.h:344:31: warning: cast truncates bits from constant value (100000000 becomes 0) 64-bit or PAE machines will not be affected by this change. Signed-off-by: Namhyung Kim LKML-Reference: <1285770588-14065-1-git-send-email-namhyung@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index a667f24c7254..1df66211fd1b 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -8,7 +8,7 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) +#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast PAGE_MASK to a signed type so that it is sign-extended if -- cgit v1.2.3 From 55572b293b3a5929e8c54bc91d14ae6264186bf6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 Oct 2010 16:42:54 -0700 Subject: x86, mrst: A function in a header file needs to be marked "inline" A function in a header file needs to be explicitly marked "inline", or gcc will complain if it is not used. Signed-off-by: H. Peter Anvin Cc: Jacob Pan Cc: v2.6.36 LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> --- arch/x86/include/asm/mrst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 16350740edf6..33fc2966beb7 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -26,7 +26,7 @@ enum mrst_cpu_type { }; extern enum mrst_cpu_type __mrst_cpu_chip; -static enum mrst_cpu_type mrst_identify_cpu(void) +static inline enum mrst_cpu_type mrst_identify_cpu(void) { return __mrst_cpu_chip; } -- cgit v1.2.3 From 5a47c7dae861c3ca3edf178546641909851bf715 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 13 Sep 2010 15:08:54 +0800 Subject: x86: Add two helper macros for fixed address mapping Sometimes fixmap will be used to map an physical address which is not PAGE align, so to use it we need first map it and then add the address offset to the mapped fixed address. These 2 new helpers are suggested by Ingo Molnar to make the process simpler. For a physicall address like "phys", a directly usable virtual address can be get by virt = (void *)set_fixmap_offset(fixed_idx, phys); or virt = (void *)set_fixmap_offset_nocache(fixed_idx, phys); (depends on whether the physical address is cachable or not). 
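As an illustration only (the register address and the function below are invented, not part of the patch; FIX_EARLYCON_MEM_BASE is merely used as an example slot), reading a 32-bit register that does not start on a page boundary could look like:

	#define DEMO_REG_PHYS	0xff10d86c	/* hypothetical, not PAGE_SIZE aligned */

	static u32 __init demo_read_reg(void)
	{
		void __iomem *reg;

		/* map the page containing DEMO_REG_PHYS, then add the in-page offset */
		reg = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
								DEMO_REG_PHYS);
		return __raw_readl(reg);
	}

Without the helpers every such caller would have to pair __set_fixmap() with a manual "+ (phys & ~PAGE_MASK)" step.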
Signed-off-by: Feng Tang Cc: alan@linux.intel.com Cc: greg@kroah.com Cc: x86@kernel.org LKML-Reference: <1284361736-23011-3-git-send-email-feng.tang@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d07b44f7d1dc..4d293dced62f 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); return __virt_to_fix(vaddr); } + +/* Return an pointer with offset calculated */ +static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + __set_fixmap(idx, phys, flags); + return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); +} + +#define set_fixmap_offset(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL) + +#define set_fixmap_offset_nocache(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE) + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ -- cgit v1.2.3 From c20b5c3318fe45e4f33f01a91ccead645dfdf619 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 13 Sep 2010 15:08:55 +0800 Subject: x86, earlyprintk: Add earlyprintk for Intel Moorestown platform Intel Moorestown platform has a spi-uart device(Maxim3110), which connects to a Designware spi core controller. This patch will add early console function based on it. As it will be used long before Linux spi subsystem get initialised, we simply directly manipulate the spi controller's register to acheive the early console func. This is safe as it will be disabled when devices subsytem get initialised. To use it, user need enable CONFIG_X86_MRST_EARLY_PRINTK in kenrel config and add "earlyprintk=mrst" in kernel command line. Signed-off-by: Feng Tang Acked-by: Alan Cox Cc: greg@kroah.com LKML-Reference: <1284361736-23011-4-git-send-email-feng.tang@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 4 + arch/x86/include/asm/mrst.h | 5 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/early_printk.c | 7 ++ arch/x86/kernel/early_printk_mrst.c | 232 ++++++++++++++++++++++++++++++++++++ 5 files changed, 249 insertions(+) create mode 100644 arch/x86/kernel/early_printk_mrst.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 75085080b63e..e5bb96b10f1a 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,6 +43,10 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. 
+config EARLY_PRINTK_MRST + bool "Early printk for MRST platform support" + depends on EARLY_PRINTK && X86_MRST + config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" depends on EARLY_PRINTK && PCI diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 33fc2966beb7..d0ea5bc505a3 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -10,6 +10,9 @@ */ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H + +#include + extern int pci_mrst_init(void); int __init sfi_parse_mrtc(struct sfi_table_header *table); @@ -42,4 +45,6 @@ extern enum mrst_timer_options mrst_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 +extern struct console early_mrst_console; +extern void mrst_early_console_init(void); #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fedf32a8c3ec..3cd01d04613d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index fa99bae75ace..6082463768a2 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -238,6 +239,12 @@ static int __init setup_early_printk(char *buf) #ifdef CONFIG_HVC_XEN if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); +#endif +#ifdef CONFIG_X86_MRST_EARLY_PRINTK + if (!strncmp(buf, "mrst", 4)) { + mrst_early_console_init(); + early_console_register(&early_mrst_console, keep); + } #endif buf++; } diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c new file mode 100644 index 000000000000..05d27e1e2b62 --- /dev/null +++ b/arch/x86/kernel/early_printk_mrst.c @@ -0,0 +1,232 @@ +/* + * early_printk_mrst.c - spi-uart early printk for Intel Moorestown platform + * + * Copyright (c) 2008-2010, Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define MRST_SPI_TIMEOUT 0x200000 +#define MRST_REGBASE_SPI0 0xff128000 +#define MRST_REGBASE_SPI1 0xff128400 +#define MRST_CLK_SPI0_REG 0xff11d86c + +/* Bit fields in CTRLR0 */ +#define SPI_DFS_OFFSET 0 + +#define SPI_FRF_OFFSET 4 +#define SPI_FRF_SPI 0x0 +#define SPI_FRF_SSP 0x1 +#define SPI_FRF_MICROWIRE 0x2 +#define SPI_FRF_RESV 0x3 + +#define SPI_MODE_OFFSET 6 +#define SPI_SCPH_OFFSET 6 +#define SPI_SCOL_OFFSET 7 +#define SPI_TMOD_OFFSET 8 +#define SPI_TMOD_TR 0x0 /* xmit & recv */ +#define SPI_TMOD_TO 0x1 /* xmit only */ +#define SPI_TMOD_RO 0x2 /* recv only */ +#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ + +#define SPI_SLVOE_OFFSET 10 +#define SPI_SRL_OFFSET 11 +#define SPI_CFS_OFFSET 12 + +/* Bit fields in SR, 7 bits */ +#define SR_MASK 0x7f /* cover 7 bits */ +#define SR_BUSY (1 << 0) +#define SR_TF_NOT_FULL (1 << 1) +#define SR_TF_EMPT (1 << 2) +#define SR_RF_NOT_EMPT (1 << 3) +#define SR_RF_FULL (1 << 4) +#define SR_TX_ERR (1 << 5) +#define SR_DCOL (1 << 6) + +struct dw_spi_reg { + u32 ctrl0; + u32 ctrl1; + u32 ssienr; + u32 mwcr; + u32 ser; + u32 baudr; + u32 txfltr; + u32 rxfltr; + u32 txflr; + u32 rxflr; + u32 sr; + u32 imr; + u32 isr; + u32 risr; + u32 txoicr; + u32 rxoicr; + u32 rxuicr; + u32 msticr; + u32 icr; + u32 dmacr; + u32 dmatdlr; + u32 dmardlr; + u32 idr; + u32 version; + + /* Currently operates as 32 bits, though only the low 16 bits matter */ + u32 dr; +} __packed; + +#define dw_readl(dw, name) __raw_readl(&(dw)->name) +#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) + +/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */ +static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; + +static u32 *pclk_spi0; +/* Always contains an accessable address, start with 0 */ +static struct dw_spi_reg *pspi; + +static struct kmsg_dumper dw_dumper; +static int dumper_registered; + +static void dw_kmsg_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + int i; + + /* When run to this, we'd better re-init the HW */ + mrst_early_console_init(); + + for (i = 0; i < l1; i++) + early_mrst_console.write(&early_mrst_console, s1 + i, 1); + for (i = 0; i < l2; i++) + early_mrst_console.write(&early_mrst_console, s2 + i, 1); +} + +/* Set the ratio rate to 115200, 8n1, IRQ disabled */ +static void max3110_write_config(void) +{ + u16 config; + + config = 0xc001; + dw_writel(pspi, dr, config); +} + +/* Translate char to a eligible word and send to max3110 */ +static void max3110_write_data(char c) +{ + u16 data; + + data = 0x8000 | c; + dw_writel(pspi, dr, data); +} + +void mrst_early_console_init(void) +{ + u32 ctrlr0 = 0; + u32 spi0_cdiv; + u32 freq; /* Freqency info only need be searched once */ + + /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ + pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + MRST_CLK_SPI0_REG); + spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; + freq = 100000000 / (spi0_cdiv + 1); + + if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL) + mrst_spi_paddr = MRST_REGBASE_SPI1; + + pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + mrst_spi_paddr); + + /* Disable SPI controller */ + dw_writel(pspi, ssienr, 0); + + /* Set control param, 8 bits, transmit only mode */ + ctrlr0 = dw_readl(pspi, ctrl0); + + ctrlr0 &= 0xfcc0; + ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) + | (SPI_TMOD_TO << 
SPI_TMOD_OFFSET); + dw_writel(pspi, ctrl0, ctrlr0); + + /* + * Change the spi0 clk to comply with 115200 bps, use 100000 to + * calculate the clk dividor to make the clock a little slower + * than real baud rate. + */ + dw_writel(pspi, baudr, freq/100000); + + /* Disable all INT for early phase */ + dw_writel(pspi, imr, 0x0); + + /* Set the cs to spi-uart */ + dw_writel(pspi, ser, 0x2); + + /* Enable the HW, the last step for HW init */ + dw_writel(pspi, ssienr, 0x1); + + /* Set the default configuration */ + max3110_write_config(); + + /* Register the kmsg dumper */ + if (!dumper_registered) { + dw_dumper.dump = dw_kmsg_dump; + kmsg_dump_register(&dw_dumper); + dumper_registered = 1; + } +} + +/* Slave select should be called in the read/write function */ +static void early_mrst_spi_putc(char c) +{ + unsigned int timeout; + u32 sr; + + timeout = MRST_SPI_TIMEOUT; + /* Early putc needs to make sure the TX FIFO is not full */ + while (--timeout) { + sr = dw_readl(pspi, sr); + if (!(sr & SR_TF_NOT_FULL)) + cpu_relax(); + else + break; + } + + if (!timeout) + pr_warning("MRST earlycon: timed out\n"); + else + max3110_write_data(c); +} + +/* Early SPI only uses polling mode */ +static void early_mrst_spi_write(struct console *con, const char *str, unsigned n) +{ + int i; + + for (i = 0; i < n && *str; i++) { + if (*str == '\n') + early_mrst_spi_putc('\r'); + early_mrst_spi_putc(*str); + str++; + } +} + +struct console early_mrst_console = { + .name = "earlymrst", + .write = early_mrst_spi_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; -- cgit v1.2.3 From 4d033556f1bfaa7604b951bfadd17aaf1b74e4c5 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 13 Sep 2010 15:08:56 +0800 Subject: x86, earlyprintk: Add hsu early console for Intel Medfield platform Intel Medfield platform has a high speed UART device, which could act as a early console. To enable early printk of HSU console, simply add "earlyprintk=hsu" in kernel command line. 
Currently we put the code in the early_printk_mrst.c as it is also for Intel MID platforms like the mrst early console Signed-off-by: Feng Tang Acked-by: Alan Cox Cc: greg@kroah.com LKML-Reference: <1284361736-23011-5-git-send-email-feng.tang@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mrst.h | 3 ++ arch/x86/kernel/early_printk.c | 6 +++ arch/x86/kernel/early_printk_mrst.c | 89 ++++++++++++++++++++++++++++++++++++- 3 files changed, 97 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index d0ea5bc505a3..4a711a684b17 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -47,4 +47,7 @@ extern enum mrst_timer_options mrst_timer_options; extern struct console early_mrst_console; extern void mrst_early_console_init(void); + +extern struct console early_hsu_console; +extern void hsu_early_console_init(void); #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 6082463768a2..4572f25f9325 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -245,6 +245,12 @@ static int __init setup_early_printk(char *buf) mrst_early_console_init(); early_console_register(&early_mrst_console, keep); } + + if (!strncmp(buf, "hsu", 3)) { + hsu_early_console_init(); + early_console_register(&early_hsu_console, keep); + } + #endif buf++; } diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c index 05d27e1e2b62..65df603622b2 100644 --- a/arch/x86/kernel/early_printk_mrst.c +++ b/arch/x86/kernel/early_printk_mrst.c @@ -1,5 +1,5 @@ /* - * early_printk_mrst.c - spi-uart early printk for Intel Moorestown platform + * early_printk_mrst.c - early consoles for Intel MID platforms * * Copyright (c) 2008-2010, Intel Corporation * @@ -9,9 +9,19 @@ * of the License. */ +/* + * This file implements two early consoles named mrst and hsu. + * mrst is based on Maxim3110 spi-uart device, it exists in both + * Moorestown and Medfield platforms, while hsu is based on a High + * Speed UART device which only exists in the Medfield platform + */ + +#include +#include #include #include #include +#include #include #include @@ -230,3 +240,80 @@ struct console early_mrst_console = { .flags = CON_PRINTBUFFER, .index = -1, }; + +/* + * Following is the early console based on Medfield HSU (High + * Speed UART) device. 
+ */ +#define HSU_PORT2_PADDR 0xffa28180 + +static void __iomem *phsu; + +void hsu_early_console_init(void) +{ + u8 lcr; + + phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + HSU_PORT2_PADDR); + + /* Disable FIFO */ + writeb(0x0, phsu + UART_FCR); + + /* Set to default 115200 bps, 8n1 */ + lcr = readb(phsu + UART_LCR); + writeb((0x80 | lcr), phsu + UART_LCR); + writeb(0x18, phsu + UART_DLL); + writeb(lcr, phsu + UART_LCR); + writel(0x3600, phsu + UART_MUL*4); + + writeb(0x8, phsu + UART_MCR); + writeb(0x7, phsu + UART_FCR); + writeb(0x3, phsu + UART_LCR); + + /* Clear IRQ status */ + readb(phsu + UART_LSR); + readb(phsu + UART_RX); + readb(phsu + UART_IIR); + readb(phsu + UART_MSR); + + /* Enable FIFO */ + writeb(0x7, phsu + UART_FCR); +} + +#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) + +static void early_hsu_putc(char ch) +{ + unsigned int timeout = 10000; /* 10ms */ + u8 status; + + while (--timeout) { + status = readb(phsu + UART_LSR); + if (status & BOTH_EMPTY) + break; + udelay(1); + } + + /* Only write the char when there was no timeout */ + if (timeout) + writeb(ch, phsu + UART_TX); +} + +static void early_hsu_write(struct console *con, const char *str, unsigned n) +{ + int i; + + for (i = 0; i < n && *str; i++) { + if (*str == '\n') + early_hsu_putc('\r'); + early_hsu_putc(*str); + str++; + } +} + +struct console early_hsu_console = { + .name = "earlyhsu", + .write = early_hsu_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; -- cgit v1.2.3 From 286e5b97eb22baab9d9a41ca76c6b933a484252c Mon Sep 17 00:00:00 2001 From: Paul Fox Date: Fri, 1 Oct 2010 18:17:19 +0100 Subject: x86, olpc: Don't retry EC commands forever Avoids a potential infinite loop. It was observed once, during an EC hacking/debugging session - not in regular operation. Signed-off-by: Daniel Drake Cc: dilinger@queued.net Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/olpc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 37c49934c785..747261302598 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -114,6 +114,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, unsigned long flags; int ret = -EIO; int i; + int restarts = 0; spin_lock_irqsave(&ec_lock, flags); @@ -169,7 +170,9 @@ restart: if (wait_on_obf(0x6c, 1)) { printk(KERN_ERR "olpc-ec: timeout waiting for" " EC to provide data!\n"); - goto restart; + if (restarts++ < 10) + goto restart; + goto err; } outbuf[i] = inb(0x68); pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); -- cgit v1.2.3 From b62be8ea9db4048112219ff6d6ce5f183179d4dc Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Thu, 26 Aug 2010 17:29:05 +0900 Subject: x86, mce, therm_throt.c: Fix missing curly braces in error handling logic When the feature PTS is not supported by CPU, the sysfile package_power_limit_count for package should not be generated. This patch is used for fixing missing { and }. The patch is not complete as there are other error handling problems in this function - but that can wait until the merge window. 
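The bug class is the unbraced if; a reduced sketch with invented function names (not the actual sysfs calls) of what the old code amounted to:

	if (cpu_has(c, X86_FEATURE_PTS))
		err = add_package_throttle_file();	/* guarded as intended */
		err = add_package_power_limit_file();	/* indented, but NOT guarded */

	/* what was meant, and what the fix restores */
	if (cpu_has(c, X86_FEATURE_PTS)) {
		err = add_package_throttle_file();
		err = add_package_power_limit_file();
	}

so without the braces the package power-limit attribute was created even on CPUs without PTS.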
Signed-off-by: Jin Dongming Reviewed-by: Fenghua Yu Acked-by: Jean Delvare Cc: Brown Len Cc: Guenter Roeck Cc: Hidetoshi Seto Cc: lm-sensors@lm-sensors.org LKML-Reference: <4C7625D1.4060201@np.css.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d9368eeda309..169d8804a9f8 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -216,7 +216,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_core_power_limit_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PTS)) + if (cpu_has(c, X86_FEATURE_PTS)) { err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_throttle_count.attr, thermal_attr_group.name); @@ -224,6 +224,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_power_limit_count.attr, thermal_attr_group.name); + } return err; } -- cgit v1.2.3 From 6dcbfe4f0b4e17e289d56fa534b7ce5a6b7f63a3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 8 Oct 2010 12:08:34 +0200 Subject: x86, AMD, MCE thresholding: Fix the MCi_MISCj iteration order This fixes possible cases of not collecting valid error info in the MCE error thresholding groups on F10h hardware. The current code contains a subtle problem of checking only the Valid bit of MSR0000_0413 (which is MC4_MISC0 - DRAM thresholding group) in its first iteration and breaking out if the bit is cleared. But (!), this MSR contains an offset value, BlkPtr[31:24], which points to the remaining MSRs in this thresholding group which might contain valid information too. But if we bail out only after we checked the valid bit in the first MSR and not the block pointer too, we miss that other information. The thing is, MC4_MISC0[BlkPtr] is not predicated on MCi_STATUS[MiscV] or MC4_MISC0[Valid] and should be checked prior to iterating over the MCI_MISCj thresholding group, irrespective of the MC4_MISC0[Valid] setting. Signed-off-by: Borislav Petkov Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5e975298fa81..39aaee5c1ab2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -141,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; + address += MCG_XBLK_ADDR; } else ++address; @@ -148,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (rdmsr_safe(address, &low, &high)) break; - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } + if (!(high & MASK_VALID_HI)) + continue; if (!(high & MASK_CNTP_HI) || (high & MASK_LOCKED_HI)) -- cgit v1.2.3 From 58877679fd393d3ef71aa383031ac7817561463d Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:18 -1000 Subject: KVM: x86: Fix SVM VMCB reset On reset, VMCB TSC should be set to zero. Instead, code was setting tsc_offset to zero, which passes through the underlying TSC. 
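(Background for the fix: under SVM a guest RDTSC returns host_tsc + vmcb->control.tsc_offset, so an offset of zero merely passes the host counter through; making the guest's TSC start near zero at reset requires tsc_offset = 0 - native_read_tsc(), which is what this change installs.)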
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bc5b9b8d4a33..12b502de1369 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -766,7 +766,7 @@ static void init_vmcb(struct vcpu_svm *svm) control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); - control->tsc_offset = 0; + control->tsc_offset = 0-native_read_tsc(); control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); -- cgit v1.2.3 From 47008cd887c1836bcadda123ba73e1863de7a6c4 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:19 -1000 Subject: KVM: x86: Move TSC reset out of vmcb_init The VMCB is reset whenever we receive a startup IPI, so Linux is setting TSC back to zero happens very late in the boot process and destabilizing the TSC. Instead, just set TSC to zero once at VCPU creation time. Why the separate patch? So git-bisect is your friend. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 12b502de1369..81ed28cb36e6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -766,7 +766,6 @@ static void init_vmcb(struct vcpu_svm *svm) control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); - control->tsc_offset = 0-native_read_tsc(); control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -902,6 +901,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); + svm->vmcb->control.tsc_offset = 0-native_read_tsc(); err = fx_init(&svm->vcpu); if (err) -- cgit v1.2.3 From 73cf624d029d776a33d0a80c695485b3f9b36231 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 10 Oct 2010 19:52:15 -0700 Subject: x86, numa: For each node, register the memory blocks actually used Russ reported SGI UV is broken recently. He said: | The SRAT table shows that memory range is spread over two nodes. | | SRAT: Node 0 PXM 0 100000000-800000000 | SRAT: Node 1 PXM 1 800000000-1000000000 | SRAT: Node 0 PXM 0 1000000000-1080000000 | |Previously, the kernel early_node_map[] would show three entries |with the proper node. | |[ 0.000000] 0: 0x00100000 -> 0x00800000 |[ 0.000000] 1: 0x00800000 -> 0x01000000 |[ 0.000000] 0: 0x01000000 -> 0x01080000 | |The problem is recent community kernel early_node_map[] shows |only two entries with the node 0 entry overlapping the node 1 |entry. | | 0: 0x00100000 -> 0x01080000 | 1: 0x00800000 -> 0x01000000 After looking at the changelog, Found out that it has been broken for a while by following commit |commit 8716273caef7f55f39fe4fc6c69c5f9f197f41f1 |Author: David Rientjes |Date: Fri Sep 25 15:20:04 2009 -0700 | | x86: Export srat physical topology Before that commit, register_active_regions() is called for every SRAT memory entry right away. Use nodememblk_range[] instead of nodes[] in order to make sure we capture the actual memory blocks registered with each node. nodes[] contains an extended range which spans all memory regions associated with a node, but that does not mean that all the memory in between are included. 
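(In the example above, nodes[0] ends up spanning 0x100000000-0x1080000000 even though the 0x800000000-0x1000000000 block inside that span belongs to node 1, so registering the per-node spans claims node 1's memory for node 0; registering node_memblk_range[i] under memblk_nodeid[i] instead reproduces the three distinct entries with their correct owners.)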
Reported-by: Russ Anderson Tested-by: Russ Anderson Signed-off-by: Yinghai Lu LKML-Reference: <4CB27BDF.5000800@kernel.org> Acked-by: David Rientjes Cc: 2.6.33 .34 .35 .36 Signed-off-by: H. Peter Anvin --- arch/x86/mm/srat_64.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index f9897f7a9ef1..9c0d0d399c30 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - for_each_node_mask(i, nodes_parsed) - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); + for (i = 0; i < num_node_memblks; i++) + e820_register_active_regions(memblk_nodeid[i], + node_memblk_range[i].start >> PAGE_SHIFT, + node_memblk_range[i].end >> PAGE_SHIFT); + /* for out of order entries in SRAT */ sort_node_map(); if (!nodes_cover_memory(nodes)) { -- cgit v1.2.3 From b683de2b3cb17bb10fa6fd4af614dc75b5749fe0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 20:55:03 +0200 Subject: genirq: Query arch for number of early descriptors sparse irq sets up NR_IRQS_LEGACY irq descriptors and archs then go ahead and allocate more. Use the unused return value of arch_probe_nr_irqs() to let the architecture return the number of early allocations. Fix up all users. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f1efebaf5510..5aee1d1a306d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3880,7 +3880,7 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return 0; + return NR_IRQS_LEGACY; } #endif -- cgit v1.2.3 From 1c9db52534a2c0e9776788cd34ccc193289fc18c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:46:51 +0200 Subject: pci: Convert msi to new irq_chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Jesse Barnes Cc: Benjamin Herrenschmidt Cc: "David S. Miller" Cc: Tony Luck Cc: Russell King --- arch/x86/kernel/apic/io_apic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7556eb7a1a47..b79938ff9bde 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3441,8 +3441,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ static struct irq_chip msi_chip = { .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, .ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_msi_irq_affinity, @@ -3452,8 +3452,8 @@ static struct irq_chip msi_chip = { static struct irq_chip msi_ir_chip = { .name = "IR-PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, #ifdef CONFIG_INTR_REMAP .ack = ir_ack_apic_edge, #ifdef CONFIG_SMP -- cgit v1.2.3 From 39431acb1a4c464e62471cb3058b8ffffb9244db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 19:09:51 +0200 Subject: pci: Cleanup the irq_desc mess in msi Handing down irq_desc to msi just so that msi can access irq_desc.irq_data.msi_desc is a pretty stupid idea. 
The calling code can hand down a pointer to msi_desc so msi code does not need to know about the irq descriptor at all. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b79938ff9bde..74bb027b517e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3383,14 +3383,14 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - get_cached_msi_msg_desc(desc, &msg); + __get_cached_msi_msg(desc->irq_data.msi_desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - write_msi_msg_desc(desc, &msg); + __write_msi_msg(desc->irq_data.msi_desc, &msg); return 0; } -- cgit v1.2.3 From 011d578fdadb64bcc1deedbb02216bfee6a9b4dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 00:15:31 +0200 Subject: x86: Remove useless reinitialization of irq descriptors The descriptors are already initialized in exactly this way. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc578..a91ab503e24f 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -100,6 +100,8 @@ int vector_used_by_percpu_irq(unsigned int vector) void __init init_ISA_irqs(void) { + struct irq_chip *chip = legacy_pic->chip; + const char *name = chip->name; int i; #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) @@ -107,19 +109,8 @@ void __init init_ISA_irqs(void) #endif legacy_pic->init(0); - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); } void __init init_IRQ(void) -- cgit v1.2.3 From a3c08e5d80c54e32423efbba113b02942c91f726 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 20:24:58 +0200 Subject: x86: Convert irq_chip access to new functions Before moving the irq chips to the new functions, fixup direct callers. The cpu offline irq fixup code needs to become generic and archs need to honour the "force" flag as an indicator, but that's for later. 
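The shape of the conversion, reduced to a sketch (using only accessors that appear in this series):

	struct irq_desc *desc = irq_to_desc(irq);
	struct irq_data *data = &desc->irq_data;

	/* old style: chip callbacks are keyed by irq number */
	desc->chip->mask(irq);
	desc->chip->set_affinity(irq, affinity);

	/* new style: chip callbacks take struct irq_data */
	data->chip->irq_mask(data);
	data->chip->irq_set_affinity(data, affinity, true);

The extra bool argument to irq_set_affinity() is the "force" flag mentioned above.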
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/irq.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 91fd0c70a18a..d765bdc48074 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -159,7 +159,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, " %8s", desc->irq_data.chip->name); seq_printf(p, "-%-8s", desc->name); if (action) { @@ -282,6 +282,7 @@ void fixup_irqs(void) unsigned int irq, vector; static int warned; struct irq_desc *desc; + struct irq_data *data; for_each_irq_desc(irq, desc) { int break_affinity = 0; @@ -296,7 +297,8 @@ void fixup_irqs(void) /* interrupt's are disabled at this point */ raw_spin_lock(&desc->lock); - affinity = desc->affinity; + data = &desc->irq_data; + affinity = data->affinity; if (!irq_has_action(irq) || cpumask_equal(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); @@ -315,16 +317,16 @@ void fixup_irqs(void) affinity = cpu_all_mask; } - if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) - desc->chip->mask(irq); + if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) + data->chip->irq_mask(data); - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); + if (data->chip->irq_set_affinity) + data->chip->irq_set_affinity(data, affinity, true); else if (!(warned++)) set_affinity = 0; - if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) - desc->chip->unmask(irq); + if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) + data->chip->irq_unmask(data); raw_spin_unlock(&desc->lock); @@ -355,10 +357,10 @@ void fixup_irqs(void) if (irr & (1 << (vector % 32))) { irq = __get_cpu_var(vector_irq)[vector]; - desc = irq_to_desc(irq); + data = irq_get_irq_data(irq); raw_spin_lock(&desc->lock); - if (desc->chip->retrigger) - desc->chip->retrigger(irq); + if (data->chip->irq_retrigger) + data->chip->irq_retrigger(data); raw_spin_unlock(&desc->lock); } } -- cgit v1.2.3 From a5ef2e70405c8a9ee380b5ff33a008c75454791f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 11:11:10 +0200 Subject: x86: Sanitize apb timer interrupt handling Disable the interrupt in CPU_DEAD where it belongs. Remove the open coded irq_desc manipulation. 
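[Editor's note: an illustrative sketch, not from the patch, of the CPU_DEAD pattern adopted below -- let genirq disable the per-cpu timer interrupt instead of open coding irq_desc manipulation. The example_timer_dev type, the example_timer per-cpu variable and the notifier name are hypothetical; disable_irq(), the CPU_DEAD case and the (action & 0xf) masking mirror the apb_timer diff.]

struct example_timer_dev {
	int irq;
};
static DEFINE_PER_CPU(struct example_timer_dev, example_timer);

static int example_timer_cpuhp_notify(struct notifier_block *n,
				       unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct example_timer_dev *dev = &per_cpu(example_timer, cpu);

	switch (action & 0xf) {
	case CPU_DEAD:
		/* Mask and track the interrupt through genirq, not via desc->status. */
		disable_irq(dev->irq);
		break;
	}
	return NOTIFY_OK;
}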
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jacob Pan --- arch/x86/kernel/apb_timer.c | 54 ++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 08f75fb4f509..42a70a2accc0 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -231,34 +231,6 @@ static void apbt_restart_clocksource(struct clocksource *cs) apbt_start_counter(phy_cs_timer_id); } -/* Setup IRQ routing via IOAPIC */ -#ifdef CONFIG_SMP -static void apbt_setup_irq(struct apbt_dev *adev) -{ - struct irq_chip *chip; - struct irq_desc *desc; - - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - desc = irq_to_desc(adev->irq); - chip = get_irq_chip(adev->irq); - disable_irq(adev->irq); - desc->status |= IRQ_MOVE_PCNTXT; - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); - /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ - set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); - enable_irq(adev->irq); - if (system_state == SYSTEM_BOOTING) - if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - adev->name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - adev->num); - } -} -#endif - static void apbt_enable_int(int n) { unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); @@ -334,6 +306,27 @@ static int __init apbt_clockevent_register(void) } #ifdef CONFIG_SMP + +static void apbt_setup_irq(struct apbt_dev *adev) +{ + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + + if (system_state == SYSTEM_BOOTING) { + irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + /* APB timer irqs are set up as mp_irqs, timer is edge type */ + __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } + } else + enable_irq(adev->irq); +} + /* Should be called with per cpu */ void apbt_setup_secondary_clock(void) { @@ -389,10 +382,11 @@ static int apbt_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_DEAD: + disable_irq(adev->irq); apbt_disable_int(cpu); - if (system_state == SYSTEM_RUNNING) + if (system_state == SYSTEM_RUNNING) { pr_debug("skipping APBT CPU %lu offline\n", cpu); - else if (adev) { + } else if (adev) { pr_debug("APBT clockevent for cpu %lu offline\n", cpu); free_irq(adev->irq, adev); } -- cgit v1.2.3 From fe25c7fc2e036e1569faac8715a8aa5496cda78d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 14:57:24 +0200 Subject: x86: lguest: Convert to new irq chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Rusty Russell --- arch/x86/lguest/boot.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 9d5f55848455..2d4e6fcac836 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -791,22 +791,22 @@ static void lguest_flush_tlb_kernel(void) * simple as setting a bit. We don't actually "ack" interrupts as such, we * just mask and unmask them. I wonder if we should be cleverer? 
*/ -static void disable_lguest_irq(unsigned int irq) +static void disable_lguest_irq(struct irq_data *data) { - set_bit(irq, lguest_data.blocked_interrupts); + set_bit(data->irq, lguest_data.blocked_interrupts); } -static void enable_lguest_irq(unsigned int irq) +static void enable_lguest_irq(struct irq_data *data) { - clear_bit(irq, lguest_data.blocked_interrupts); + clear_bit(data->irq, lguest_data.blocked_interrupts); } /* This structure describes the lguest IRQ controller. */ static struct irq_chip lguest_irq_controller = { .name = "lguest", - .mask = disable_lguest_irq, - .mask_ack = disable_lguest_irq, - .unmask = enable_lguest_irq, + .irq_mask = disable_lguest_irq, + .irq_mask_ack = disable_lguest_irq, + .irq_unmask = enable_lguest_irq, }; /* -- cgit v1.2.3 From 020dd984d7c0792e8234f2e4b4fb0534fe750f9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 14:59:58 +0200 Subject: x86: Cleanup visws interrupt handling Remove the open coded access to irq_desc and convert to the new irq chip functions. Change the mask function of piix4_virtual_irq_type so we can use the generic irq handling function for the virtual interrupt instead of open coding it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/visws_quirks.c | 140 +++++++++++++---------------------------- 1 file changed, 44 insertions(+), 96 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index e680ea52db9b..3371bd053b89 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -66,10 +66,7 @@ static void __init visws_time_init(void) } /* Replaces the default init_ISA_irqs in the generic setup */ -static void __init visws_pre_intr_init(void) -{ - init_VISWS_APIC_irqs(); -} +static void __init visws_pre_intr_init(void); /* Quirk for machine specific memory setup. */ @@ -429,67 +426,34 @@ static int is_co_apic(unsigned int irq) /* * This is the SGI Cobalt (IO-)APIC: */ - -static void enable_cobalt_irq(unsigned int irq) +static void enable_cobalt_irq(struct irq_data *data) { - co_apic_set(is_co_apic(irq), irq); + co_apic_set(is_co_apic(data->irq), data->irq); } -static void disable_cobalt_irq(unsigned int irq) +static void disable_cobalt_irq(struct irq_data *data) { - int entry = is_co_apic(irq); + int entry = is_co_apic(data->irq); co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); co_apic_read(CO_APIC_LO(entry)); } -/* - * "irq" really just serves to identify the device. Here is where we - * map this to the Cobalt APIC entry where it's physically wired. 
- * This is called via request_irq -> setup_irq -> irq_desc->startup() - */ -static unsigned int startup_cobalt_irq(unsigned int irq) +static void ack_cobalt_irq(struct irq_data *data) { unsigned long flags; - struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) - desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); - enable_cobalt_irq(irq); - spin_unlock_irqrestore(&cobalt_lock, flags); - return 0; -} - -static void ack_cobalt_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - disable_cobalt_irq(irq); + disable_cobalt_irq(data); apic_write(APIC_EOI, APIC_EIO_ACK); spin_unlock_irqrestore(&cobalt_lock, flags); } -static void end_cobalt_irq(unsigned int irq) -{ - unsigned long flags; - struct irq_desc *desc = irq_to_desc(irq); - - spin_lock_irqsave(&cobalt_lock, flags); - if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS))) - enable_cobalt_irq(irq); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - static struct irq_chip cobalt_irq_type = { - .name = "Cobalt-APIC", - .startup = startup_cobalt_irq, - .shutdown = disable_cobalt_irq, - .enable = enable_cobalt_irq, - .disable = disable_cobalt_irq, - .ack = ack_cobalt_irq, - .end = end_cobalt_irq, + .name = "Cobalt-APIC", + .irq_enable = enable_cobalt_irq, + .irq_disable = disable_cobalt_irq, + .irq_ack = ack_cobalt_irq, }; @@ -503,35 +467,34 @@ static struct irq_chip cobalt_irq_type = { * interrupt controller type, and through a special virtual interrupt- * controller. Device drivers only see the virtual interrupt sources. */ -static unsigned int startup_piix4_master_irq(unsigned int irq) +static unsigned int startup_piix4_master_irq(struct irq_data *data) { legacy_pic->init(0); - - return startup_cobalt_irq(irq); + enable_cobalt_irq(data); } -static void end_piix4_master_irq(unsigned int irq) +static void end_piix4_master_irq(struct irq_data *data) { unsigned long flags; spin_lock_irqsave(&cobalt_lock, flags); - enable_cobalt_irq(irq); + enable_cobalt_irq(data); spin_unlock_irqrestore(&cobalt_lock, flags); } static struct irq_chip piix4_master_irq_type = { - .name = "PIIX4-master", - .startup = startup_piix4_master_irq, - .ack = ack_cobalt_irq, - .end = end_piix4_master_irq, + .name = "PIIX4-master", + .irq_startup = startup_piix4_master_irq, + .irq_ack = ack_cobalt_irq, }; +static void pii4_mask(struct irq_data *data) { } static struct irq_chip piix4_virtual_irq_type = { - .name = "PIIX4-virtual", + .name = "PIIX4-virtual", + .mask = pii4_mask, }; - /* * PIIX4-8259 master/virtual functions to handle interrupt requests * from legacy devices: floppy, parallel, serial, rtc. @@ -549,9 +512,8 @@ static struct irq_chip piix4_virtual_irq_type = { */ static irqreturn_t piix4_master_intr(int irq, void *dev_id) { - int realirq; - struct irq_desc *desc; unsigned long flags; + int realirq; raw_spin_lock_irqsave(&i8259A_lock, flags); @@ -592,18 +554,10 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) raw_spin_unlock_irqrestore(&i8259A_lock, flags); - desc = irq_to_desc(realirq); - /* * handle this 'virtual interrupt' as a Cobalt one now. 
*/ - kstat_incr_irqs_this_cpu(realirq, desc); - - if (likely(desc->action != NULL)) - handle_IRQ_event(realirq, desc->action); - - if (!(desc->status & IRQ_DISABLED)) - legacy_pic->chip->unmask(realirq); + generic_handle_irq(realirq); return IRQ_HANDLED; @@ -624,41 +578,35 @@ static struct irqaction cascade_action = { static inline void set_piix4_virtual_irq_type(void) { - piix4_virtual_irq_type.shutdown = i8259A_chip.mask; piix4_virtual_irq_type.enable = i8259A_chip.unmask; piix4_virtual_irq_type.disable = i8259A_chip.mask; + piix4_virtual_irq_type.unmask = i8259A_chip.unmask; } -void init_VISWS_APIC_irqs(void) +static void __init visws_pre_intr_init(void) { int i; - for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = 0; - desc->depth = 1; + set_piix4_virtual_irq_type(); - if (i == 0) { - desc->chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_IDE0) { - desc->chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_IDE1) { - desc->chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_8259) { - desc->chip = &piix4_master_irq_type; - } - else if (i < CO_IRQ_APIC0) { - set_piix4_virtual_irq_type(); - desc->chip = &piix4_virtual_irq_type; - } - else if (IS_CO_APIC(i)) { - desc->chip = &cobalt_irq_type; - } + for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { + struct irq_chip *chip = NULL; + + if (i == 0) + chip = &cobalt_irq_type; + else if (i == CO_IRQ_IDE0) + chip = &cobalt_irq_type; + else if (i == CO_IRQ_IDE1) + >chip = &cobalt_irq_type; + else if (i == CO_IRQ_8259) + chip = &piix4_master_irq_type; + else if (i < CO_IRQ_APIC0) + chip = &piix4_virtual_irq_type; + else if (IS_CO_APIC(i)) + chip = &cobalt_irq_type; + + if (chip) + set_irq_chip(i, chip); } setup_irq(CO_IRQ_8259, &master_action); -- cgit v1.2.3 From 4305df947ca1fd52867c8d56837a4e6b1e33167c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 15:01:33 +0200 Subject: x86: i8259: Convert to new irq_chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/i8259.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 20 +++++++------- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/i8259.c | 63 +++++++++++++++++++++--------------------- arch/x86/kernel/smpboot.c | 4 +-- 5 files changed, 47 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 1655147646aa..a20365953bf8 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -55,6 +55,8 @@ extern struct irq_chip i8259A_chip; struct legacy_pic { int nr_legacy_irqs; struct irq_chip *chip; + void (*mask)(unsigned int irq); + void (*unmask)(unsigned int irq); void (*mask_all)(void); void (*restore_mask)(void); void (*init)(int auto_eoi); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 74bb027b517e..e5ae2a222620 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1459,7 +1459,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq ioapic_register_intr(irq, desc, trigger); if (irq < legacy_pic->nr_legacy_irqs) - legacy_pic->chip->mask(irq); + legacy_pic->mask(irq); ioapic_write_entry(apic_id, pin, entry); } @@ -2233,7 +2233,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) raw_spin_lock_irqsave(&ioapic_lock, flags); if (irq < legacy_pic->nr_legacy_irqs) { - legacy_pic->chip->mask(irq); + legacy_pic->mask(irq); if 
(legacy_pic->irq_pending(irq)) was_pending = 1; } @@ -2928,7 +2928,7 @@ static inline void __init check_timer(void) /* * get/set the timer IRQ vector: */ - legacy_pic->chip->mask(0); + legacy_pic->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -3000,7 +3000,7 @@ static inline void __init check_timer(void) if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -3023,14 +3023,14 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->chip->mask(0); + legacy_pic->mask(0); setup_nmi(); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); } goto out; } @@ -3038,7 +3038,7 @@ static inline void __init check_timer(void) * Cleanup, just in case ... */ local_irq_disable(); - legacy_pic->chip->mask(0); + legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -3057,14 +3057,14 @@ static inline void __init check_timer(void) lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - legacy_pic->chip->mask(0); + legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index a43f71cb30f8..c90041ccb742 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -178,7 +178,7 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC) { if (!timer_through_8259) - legacy_pic->chip->mask(0); + legacy_pic->mask(0); on_each_cpu(__acpi_nmi_disable, NULL, 1); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index cafa7c80ac95..20757cb2efa3 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -29,24 +29,10 @@ * plus some generic x86 specific things if generic specifics makes * any sense at all. 
*/ +static void init_8259A(int auto_eoi); static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); -static void mask_8259A(void); -static void unmask_8259A(void); -static void disable_8259A_irq(unsigned int irq); -static void enable_8259A_irq(unsigned int irq); -static void init_8259A(int auto_eoi); -static int i8259A_irq_pending(unsigned int irq); - -struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; /* * 8259A PIC functions to handle ISA devices: @@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff; */ unsigned long io_apic_irqs; -static void disable_8259A_irq(unsigned int irq) +static void mask_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; @@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -static void enable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(struct irq_data *data) +{ + mask_8259A_irq(data->irq); +} + +static void unmask_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; @@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } +static void enable_8259A_irq(struct irq_data *data) +{ + unmask_8259A_irq(data->irq); +} + static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<irq; unsigned int irqmask = 1 << irq; unsigned long flags; @@ -223,6 +220,14 @@ spurious_8259A_irq: } } +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .irq_mask = disable_8259A_irq, + .irq_disable = disable_8259A_irq, + .irq_unmask = enable_8259A_irq, + .irq_mask_ack = mask_and_ack_8259A, +}; + static char irq_trigger[2]; /** * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ @@ -342,9 +347,9 @@ static void init_8259A(int auto_eoi) * In AEOI mode we just have to mask the interrupt * when acking. 
*/ - i8259A_chip.mask_ack = disable_8259A_irq; + i8259A_chip.irq_mask_ack = disable_8259A_irq; else - i8259A_chip.mask_ack = mask_and_ack_8259A; + i8259A_chip.irq_mask_ack = mask_and_ack_8259A; udelay(100); /* wait for 8259A to initialize */ @@ -363,14 +368,6 @@ static void init_8259A(int auto_eoi) static void legacy_pic_noop(void) { }; static void legacy_pic_uint_noop(unsigned int unused) { }; static void legacy_pic_int_noop(int unused) { }; - -static struct irq_chip dummy_pic_chip = { - .name = "dummy pic", - .mask = legacy_pic_uint_noop, - .unmask = legacy_pic_uint_noop, - .disable = legacy_pic_uint_noop, - .mask_ack = legacy_pic_uint_noop, -}; static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; @@ -378,7 +375,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq) struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, - .chip = &dummy_pic_chip, + .chip = &dummy_irq_chip, + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, @@ -389,7 +388,9 @@ struct legacy_pic null_legacy_pic = { struct legacy_pic default_legacy_pic = { .nr_legacy_irqs = NR_IRQS_LEGACY, .chip = &i8259A_chip, - .mask_all = mask_8259A, + .mask = mask_8259A_irq, + .unmask = unmask_8259A_irq, + .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, .irq_pending = i8259A_irq_pending, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 87a8c6b00f8d..864b386f6c0e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -324,9 +324,9 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->chip->mask(0); + legacy_pic->mask(0); enable_NMI_through_LVT0(); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); } /* This must be done before setting cpu_online_mask */ -- cgit v1.2.3 From d4eba29770244e7cc5e60c0977d73d84148a3d6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Sep 2010 12:26:18 +0200 Subject: x86: Cleanup access to irq_data Fixup the open coded access to irq_desc->[handler_data|chip_data|msi-desc] Use the macros and inline functions for it. 
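[Editor's note: a condensed illustration of the accessor conversion described above; the example_get_cfg() wrapper is hypothetical, while the open coded field access and both accessors appear in the diffs below.]

static struct irq_cfg *example_get_cfg(struct irq_desc *desc, unsigned int irq)
{
	struct irq_cfg *cfg;

	/* Open coded access, as removed by this patch: */
	cfg = desc->chip_data;

	/* Accessors used instead: */
	cfg = get_irq_desc_chip_data(desc);	/* when a descriptor is already in hand */
	cfg = get_irq_chip_data(irq);		/* when only the irq number is known */

	return cfg;
}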
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 63 +++++++++++++++++++++--------------------- arch/x86/kernel/uv_irq.c | 2 +- 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e5ae2a222620..fa0d92a6db59 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -190,7 +190,7 @@ struct irq_cfg *irq_cfg(unsigned int irq) desc = irq_to_desc(irq); if (desc) - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); return cfg; } @@ -219,10 +219,11 @@ int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(node); - if (!desc->chip_data) { + cfg = get_one_free_irq_cfg(node); + desc->chip_data = cfg; + if (!cfg) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); } @@ -325,8 +326,8 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) { struct irq_cfg *old_cfg, *cfg; - old_cfg = old_desc->chip_data; - cfg = desc->chip_data; + old_cfg = get_irq_desc_chip_data(old_desc); + cfg = get_irq_desc_chip_data(desc); if (old_cfg == cfg) return; @@ -594,7 +595,7 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned long flags; BUG_ON(!cfg); @@ -606,7 +607,7 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) { - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -1269,7 +1270,7 @@ void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); /* * If it is a legacy IRQ handled by the legacy PIC, this cpu @@ -1427,7 +1428,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq if (!IO_APIC_IRQ(irq)) return; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); /* * For legacy irqs, cfg->domain starts with cpu 0 for legacy @@ -1516,7 +1517,7 @@ static void __init setup_IO_APIC_irqs(void) printk(KERN_INFO "can not get irq_desc for %d\n", irq); continue; } - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); add_pin_to_irq_node(cfg, node, apic_id, pin); /* * don't mark it in pin_programmed, so later acpi could @@ -1567,7 +1568,7 @@ void setup_IO_APIC_irq_extra(u32 gsi) return; } - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); add_pin_to_irq_node(cfg, node, apic_id, pin); if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { @@ -1718,7 +1719,7 @@ __apicdebuginit(void) print_IO_APIC(void) for_each_irq_desc(irq, desc) { struct irq_pin_list *entry; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -2323,7 +2324,7 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, return -1; irq = desc->irq; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (assign_irq_vector(irq, cfg, mask)) return -1; @@ -2343,7 +2344,7 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) int ret = -1; irq = desc->irq; - cfg = desc->chip_data; + cfg = 
get_irq_desc_chip_data(desc); raw_spin_lock_irqsave(&ioapic_lock, flags); ret = set_desc_affinity(desc, mask, &dest); @@ -2396,7 +2397,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (get_irte(irq, &irte)) return ret; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (assign_irq_vector(irq, cfg, mask)) return ret; @@ -2500,7 +2501,7 @@ unlock: static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned me; if (likely(!cfg->move_in_progress)) @@ -2520,7 +2521,7 @@ static void irq_complete_move(struct irq_desc **descp) void irq_force_complete_move(int irq) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); if (!cfg) return; @@ -2588,7 +2589,7 @@ static void eoi_ioapic_irq(struct irq_desc *desc) unsigned int irq; irq = desc->irq; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); raw_spin_lock_irqsave(&ioapic_lock, flags); __eoi_ioapic_irq(irq, cfg); @@ -2644,7 +2645,7 @@ static void ack_apic_level(unsigned int irq) * we use the above logic (mask+edge followed by unmask+level) from * Manfred Spraul to clear the remote IRR. */ - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -2695,7 +2696,7 @@ static void ack_apic_level(unsigned int irq) * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); @@ -2763,7 +2764,7 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for_each_irq_desc(irq, desc) { - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. 
We don't have an entry for this, @@ -2917,7 +2918,7 @@ int timer_through_8259 __initdata; static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -3250,13 +3251,13 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; } - cfg_new = desc_new->chip_data; + cfg_new = get_irq_desc_chip_data(desc_new); if (cfg_new->vector != 0) continue; desc_new = move_irq_desc(desc_new, node); - cfg_new = desc_new->chip_data; + cfg_new = get_irq_desc_chip_data(desc_new); if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; @@ -3381,7 +3382,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); __get_cached_msi_msg(desc->irq_data.msi_desc, &msg); @@ -3403,7 +3404,7 @@ static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned int dest; struct irte irte; @@ -3595,7 +3596,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); dmar_msi_read(irq, &msg); @@ -3650,7 +3651,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); hpet_msi_read(irq, &msg); @@ -3756,7 +3757,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); target_ht_irq(irq, dest, cfg->vector); @@ -3903,7 +3904,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= legacy_pic->nr_legacy_irqs) { - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", pin, irq); diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 1132129db792..2233a42fb907 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -209,7 +209,7 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned int dest; unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; -- cgit v1.2.3 From dd5f15e5cf104c9170b76ae3274f35b42a3e4161 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 15:18:35 +0200 Subject: x86: Cleanup io_apic Sanitize functions. Remove irq_desc pointer magic. Preparatory patch for further cleanups. 
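[Editor's note: the shape of this cleanup, condensed from the diff below rather than invented: helpers take struct irq_cfg * directly and do their own locking, while thin irq-number wrappers fetch the cfg through the chip-data accessor, so struct irq_desc * no longer has to be threaded through every call level. The example_*() names are hypothetical.]

static void example_mask(struct irq_cfg *cfg)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ioapic_lock, flags);
	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}

static void example_mask_irq(unsigned int irq)
{
	example_mask(get_irq_chip_data(irq));
}

This also lines up with the later irq_chip conversion: once the callback receives struct irq_data *, the cfg is simply data->chip_data and the irq-number wrapper disappears.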
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 109 ++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 67 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fa0d92a6db59..3c4dee8a9ef7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -572,11 +572,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); -} - static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -588,44 +583,41 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) +static void mask_ioapic(struct irq_cfg *cfg) { + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_IO_APIC_irq_desc(struct irq_desc *desc) +static void mask_ioapic_irq(unsigned int irq) { - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); - unsigned long flags; + struct irq_cfg *cfg = get_irq_chip_data(irq); - BUG_ON(!cfg); + mask_ioapic(cfg); +} - raw_spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(cfg); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); +static void __unmask_ioapic(struct irq_cfg *cfg) +{ + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) +static void unmask_ioapic(struct irq_cfg *cfg) { - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(cfg); + __unmask_ioapic(cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_IO_APIC_irq(unsigned int irq) +static void unmask_ioapic_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = get_irq_chip_data(irq); - mask_IO_APIC_irq_desc(desc); -} -static void unmask_IO_APIC_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(cfg); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) @@ -2239,7 +2231,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) was_pending = 1; } cfg = irq_cfg(irq); - __unmask_IO_APIC_irq(cfg); + __unmask_ioapic(cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2498,10 +2490,8 @@ unlock: irq_exit(); } -static void __irq_complete_move(struct irq_desc **descp, unsigned vector) +static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { - struct irq_desc *desc = *descp; - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned me; if (likely(!cfg->move_in_progress)) @@ -2513,30 +2503,29 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector) send_cleanup_vector(cfg); } -static void irq_complete_move(struct irq_desc **descp) +static void irq_complete_move(struct irq_cfg *cfg) { - __irq_complete_move(descp, ~get_irq_regs()->orig_ax); + __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); } void irq_force_complete_move(int irq) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); + struct irq_cfg *cfg = get_irq_chip_data(irq); if (!cfg) return; - __irq_complete_move(&desc, 
cfg->vector); + __irq_complete_move(cfg, cfg->vector); } #else -static inline void irq_complete_move(struct irq_desc **descp) {} +static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif static void ack_apic_edge(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = get_irq_chip_data(irq); - irq_complete_move(&desc); + irq_complete_move(cfg); move_native_irq(irq); ack_APIC_irq(); } @@ -2559,10 +2548,12 @@ atomic_t irq_mis_count; * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. */ -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; + unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { if (mp_ioapics[entry->apic].apicver >= 0x20) { /* @@ -2580,36 +2571,22 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) __unmask_and_level_IO_APIC_irq(entry); } } -} - -static void eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = get_irq_desc_chip_data(desc); - - raw_spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); + int i, do_unmask_irq = 0; unsigned long v; - int i; - struct irq_cfg *cfg; - int do_unmask_irq = 0; - irq_complete_move(&desc); + irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ if (unlikely(desc->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; - mask_IO_APIC_irq_desc(desc); + mask_ioapic(cfg); } #endif @@ -2645,7 +2622,6 @@ static void ack_apic_level(unsigned int irq) * we use the above logic (mask+edge followed by unmask+level) from * Manfred Spraul to clear the remote IRR. */ - cfg = get_irq_desc_chip_data(desc); i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -2665,7 +2641,7 @@ static void ack_apic_level(unsigned int irq) if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - eoi_ioapic_irq(desc); + eoi_ioapic_irq(irq, cfg); } /* Now we can move and renable the irq */ @@ -2696,10 +2672,9 @@ static void ack_apic_level(unsigned int irq) * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. 
*/ - cfg = get_irq_desc_chip_data(desc); if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(cfg); } } @@ -2711,18 +2686,18 @@ static void ir_ack_apic_edge(unsigned int irq) static void ir_ack_apic_level(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = get_irq_chip_data(irq); ack_APIC_irq(); - eoi_ioapic_irq(desc); + eoi_ioapic_irq(irq, cfg); } #endif /* CONFIG_INTR_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, + .mask = mask_ioapic_irq, + .unmask = unmask_ioapic_irq, .ack = ack_apic_edge, .eoi = ack_apic_level, #ifdef CONFIG_SMP @@ -2734,8 +2709,8 @@ static struct irq_chip ioapic_chip __read_mostly = { static struct irq_chip ir_ioapic_chip __read_mostly = { .name = "IR-IO-APIC", .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, + .mask = mask_ioapic_irq, + .unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP .ack = ir_ack_apic_edge, .eoi = ir_ack_apic_level, @@ -2996,7 +2971,7 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(cfg); } if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { -- cgit v1.2.3 From 61a38ce3f59cdb4654e9444329195bd57c3baf74 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:00:34 +0200 Subject: x86: io_apic: Convert startup to new irq_chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c4dee8a9ef7..5ced690b8496 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2218,11 +2218,10 @@ static int __init timer_irq_works(void) * an edge even if it isn't on the 8259A... 
*/ -static unsigned int startup_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(struct irq_data *data) { - int was_pending = 0; + int was_pending = 0, irq = data->irq; unsigned long flags; - struct irq_cfg *cfg; raw_spin_lock_irqsave(&ioapic_lock, flags); if (irq < legacy_pic->nr_legacy_irqs) { @@ -2230,8 +2229,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - cfg = irq_cfg(irq); - __unmask_ioapic(cfg); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2695,7 +2693,7 @@ static void ir_ack_apic_level(unsigned int irq) static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", - .startup = startup_ioapic_irq, + .irq_startup = startup_ioapic_irq, .mask = mask_ioapic_irq, .unmask = unmask_ioapic_irq, .ack = ack_apic_edge, @@ -2708,7 +2706,7 @@ static struct irq_chip ioapic_chip __read_mostly = { static struct irq_chip ir_ioapic_chip __read_mostly = { .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, + .irq_startup = startup_ioapic_irq, .mask = mask_ioapic_irq, .unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP -- cgit v1.2.3 From 90297c5fe71d32a2a0ead38bd8f6b1112a2e5ac0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:03:54 +0200 Subject: x86: ioapic: Convert mask to new irq_chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 95 +++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5ced690b8496..b8b013f0cfdd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -592,11 +592,9 @@ static void mask_ioapic(struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_ioapic_irq(unsigned int irq) +static void mask_ioapic_irq(struct irq_data *data) { - struct irq_cfg *cfg = get_irq_chip_data(irq); - - mask_ioapic(cfg); + mask_ioapic(data->chip_data); } static void __unmask_ioapic(struct irq_cfg *cfg) @@ -613,11 +611,9 @@ static void unmask_ioapic(struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_ioapic_irq(unsigned int irq) +static void unmask_ioapic_irq(struct irq_data *data) { - struct irq_cfg *cfg = get_irq_chip_data(irq); - - unmask_ioapic(cfg); + unmask_ioapic(data->chip_data); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) @@ -2235,10 +2231,9 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) return was_pending; } -static int ioapic_retrigger_irq(unsigned int irq) +static int ioapic_retrigger_irq(struct irq_data *data) { - - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_cfg *cfg = data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); @@ -2519,12 +2514,10 @@ void irq_force_complete_move(int irq) static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif -static void ack_apic_edge(unsigned int irq) +static void ack_apic_edge(struct irq_data *data) { - struct irq_cfg *cfg = get_irq_chip_data(irq); - - irq_complete_move(cfg); - move_native_irq(irq); + irq_complete_move(data->chip_data); + move_native_irq(data->irq); ack_APIC_irq(); } @@ -2572,11 +2565,11 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void ack_apic_level(unsigned int irq) +static void 
ack_apic_level(struct irq_data *data) { + struct irq_cfg *cfg = data->chip_data; + int i, do_unmask_irq = 0, irq = data->irq; struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); - int i, do_unmask_irq = 0; unsigned long v; irq_complete_move(cfg); @@ -2677,46 +2670,44 @@ static void ack_apic_level(unsigned int irq) } #ifdef CONFIG_INTR_REMAP -static void ir_ack_apic_edge(unsigned int irq) +static void ir_ack_apic_edge(struct irq_data *data) { ack_APIC_irq(); } -static void ir_ack_apic_level(unsigned int irq) +static void ir_ack_apic_level(struct irq_data *data) { - struct irq_cfg *cfg = get_irq_chip_data(irq); - ack_APIC_irq(); - eoi_ioapic_irq(irq, cfg); + eoi_ioapic_irq(data->irq, data->chip_data); } #endif /* CONFIG_INTR_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .irq_startup = startup_ioapic_irq, - .mask = mask_ioapic_irq, - .unmask = unmask_ioapic_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = ack_apic_edge, + .irq_eoi = ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip ir_ioapic_chip __read_mostly = { .name = "IR-IO-APIC", .irq_startup = startup_ioapic_irq, - .mask = mask_ioapic_irq, - .unmask = unmask_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, - .eoi = ir_ack_apic_level, + .irq_ack = ir_ack_apic_edge, + .irq_eoi = ir_ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ir_ioapic_affinity_irq, #endif #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static inline void init_IO_APIC_traps(void) @@ -2757,7 +2748,7 @@ static inline void init_IO_APIC_traps(void) * The local APIC irq-chip implementation: */ -static void mask_lapic_irq(unsigned int irq) +static void mask_lapic_irq(struct irq_data *data) { unsigned long v; @@ -2765,7 +2756,7 @@ static void mask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } -static void unmask_lapic_irq(unsigned int irq) +static void unmask_lapic_irq(struct irq_data *data) { unsigned long v; @@ -2773,16 +2764,16 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq(unsigned int irq) +static void ack_lapic_irq(struct irq_data *data) { ack_APIC_irq(); } static struct irq_chip lapic_chip __read_mostly = { .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, + .irq_mask = mask_lapic_irq, + .irq_unmask = unmask_lapic_irq, + .irq_ack = ack_lapic_irq, }; static void lapic_register_intr(int irq, struct irq_desc *desc) @@ -3417,11 +3408,11 @@ static struct irq_chip msi_chip = { .name = "PCI-MSI", .irq_unmask = unmask_msi_irq, .irq_mask = mask_msi_irq, - .ack = ack_apic_edge, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_msi_irq_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip msi_ir_chip = { @@ -3429,12 +3420,12 @@ static struct irq_chip msi_ir_chip = { .irq_unmask = unmask_msi_irq, .irq_mask = mask_msi_irq, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = ir_set_msi_irq_affinity, #endif #endif - .retrigger = ioapic_retrigger_irq, + 
.irq_retrigger = ioapic_retrigger_irq, }; /* @@ -3589,11 +3580,11 @@ static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .unmask = dmar_msi_unmask, .mask = dmar_msi_mask, - .ack = ack_apic_edge, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = dmar_msi_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_dmar_msi(unsigned int irq) @@ -3645,23 +3636,23 @@ static struct irq_chip ir_hpet_msi_type = { .unmask = hpet_msi_unmask, .mask = hpet_msi_mask, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = ir_set_msi_irq_affinity, #endif #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, .mask = hpet_msi_mask, - .ack = ack_apic_edge, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = hpet_msi_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_hpet_msi(unsigned int irq, unsigned int id) @@ -3743,11 +3734,11 @@ static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .mask = mask_ht_irq, .unmask = unmask_ht_irq, - .ack = ack_apic_edge, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_ht_irq_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -- cgit v1.2.3 From d0fbca8f9304d1760fdc906b35b06e89fbdbb51f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:18:39 +0200 Subject: x86: ioapic/hpet: Convert to new chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/hpet.h | 10 ++++++---- arch/x86/kernel/apic/io_apic.c | 30 ++++++++++++++---------------- arch/x86/kernel/hpet.c | 16 ++++++---------- 3 files changed, 26 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 1d5c08a1bdfd..2c392d663dce 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,10 +74,12 @@ extern void hpet_disable(void); extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); -extern void hpet_msi_unmask(unsigned int irq); -extern void hpet_msi_mask(unsigned int irq); -extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); -extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); +struct irq_data; +extern void hpet_msi_unmask(struct irq_data *data); +extern void hpet_msi_mask(struct irq_data *data); +struct hpet_dev; +extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); +extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); #ifdef CONFIG_PCI_MSI extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b8b013f0cfdd..49aa857ff004 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3605,26 +3605,25 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_desc *desc = irq_to_desc(data->irq); + struct irq_cfg 
*cfg = data->chip_data; struct msi_msg msg; unsigned int dest; if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = get_irq_desc_chip_data(desc); - - hpet_msi_read(irq, &msg); + hpet_msi_read(data->handler_data, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - hpet_msi_write(irq, &msg); + hpet_msi_write(data->handler_data, &msg); return 0; } @@ -3633,8 +3632,8 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) static struct irq_chip ir_hpet_msi_type = { .name = "IR-HPET_MSI", - .unmask = hpet_msi_unmask, - .mask = hpet_msi_mask, + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, #ifdef CONFIG_INTR_REMAP .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP @@ -3646,20 +3645,19 @@ static struct irq_chip ir_hpet_msi_type = { static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", - .unmask = hpet_msi_unmask, - .mask = hpet_msi_mask, + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = hpet_msi_set_affinity, + .irq_set_affinity = hpet_msi_set_affinity, #endif .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { - int ret; struct msi_msg msg; - struct irq_desc *desc = irq_to_desc(irq); + int ret; if (intr_remapping_enabled) { struct intel_iommu *iommu = map_hpet_to_ir(id); @@ -3677,8 +3675,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) if (ret < 0) return ret; - hpet_msi_write(irq, &msg); - desc->status |= IRQ_MOVE_PCNTXT; + hpet_msi_write(get_irq_data(irq), &msg); + irq_set_status_flags(irq,IRQ_MOVE_PCNTXT); if (irq_remapped(irq)) set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, handle_edge_irq, "edge"); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7494999141b3..efaf906daf93 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -440,9 +440,9 @@ static int hpet_legacy_next_event(unsigned long delta, static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); static struct hpet_dev *hpet_devs; -void hpet_msi_unmask(unsigned int irq) +void hpet_msi_unmask(struct irq_data *data) { - struct hpet_dev *hdev = get_irq_data(irq); + struct hpet_dev *hdev = data->handler_data; unsigned int cfg; /* unmask it */ @@ -451,10 +451,10 @@ void hpet_msi_unmask(unsigned int irq) hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } -void hpet_msi_mask(unsigned int irq) +void hpet_msi_mask(struct irq_data *data) { + struct hpet_dev *hdev = data->handler_data; unsigned int cfg; - struct hpet_dev *hdev = get_irq_data(irq); /* mask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); @@ -462,18 +462,14 @@ void hpet_msi_mask(unsigned int irq) hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } -void hpet_msi_write(unsigned int irq, struct msi_msg *msg) +void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg) { - struct hpet_dev *hdev = get_irq_data(irq); - hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); } -void hpet_msi_read(unsigned int irq, struct msi_msg *msg) +void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg) { - struct hpet_dev *hdev = get_irq_data(irq); - msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); msg->address_hi = 0; -- cgit v1.2.3 From 5c2837fbaa609e615ef9a1c58a4cd26ce90be35b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 
17:15:11 +0200 Subject: dmar: Convert to new irq chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: David Woodhouse --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 49aa857ff004..72b253ef310e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3578,8 +3578,8 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", - .unmask = dmar_msi_unmask, - .mask = dmar_msi_mask, + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = dmar_msi_set_affinity, -- cgit v1.2.3 From e9f7ac664bfc36685a8eb3315ec21c067d0cee36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 17:22:09 +0200 Subject: ht: Convert to new irq_chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 72b253ef310e..5579f3f5943a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3730,8 +3730,8 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) static struct irq_chip ht_irq_chip = { .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_ht_irq_affinity, -- cgit v1.2.3 From 60c69948e5b6357ac0d5ef2a2d0ce31c173c3c64 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 17:28:38 +0200 Subject: x86: ioapic: Clean up the direct access to irq_desc Most of it is useless pseudo optimization. 
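[Editor's note: an illustrative sketch, not part of the patch, of replacing direct desc->status manipulation with the genirq helpers used throughout the diff below; the example_mark_level() wrapper is hypothetical.]

static void example_mark_level(unsigned int irq, bool level)
{
	/*
	 * Open coded form removed by this patch:
	 *	struct irq_desc *desc = irq_to_desc(irq);
	 *	desc->status |= IRQ_LEVEL;
	 */
	if (level)
		irq_set_status_flags(irq, IRQ_LEVEL);
	else
		irq_clear_status_flags(irq, IRQ_LEVEL);
}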
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 79 +++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5579f3f5943a..82c3c66e333f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -150,10 +150,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS]; int __init arch_early_irq_init(void) { struct irq_cfg *cfg; - struct irq_desc *desc; - int count; - int node; - int i; + int count, node, i; if (!legacy_pic->nr_legacy_irqs) { nr_irqs_gsi = 0; @@ -165,8 +162,7 @@ int __init arch_early_irq_init(void) node = cpu_to_node(0); for (i = 0; i < count; i++) { - desc = irq_to_desc(i); - desc->chip_data = &cfg[i]; + set_irq_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); /* @@ -185,14 +181,7 @@ int __init arch_early_irq_init(void) #ifdef CONFIG_SPARSE_IRQ struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg = NULL; - struct irq_desc *desc; - - desc = irq_to_desc(irq); - if (desc) - cfg = get_irq_desc_chip_data(desc); - - return cfg; + return get_irq_chip_data(irq); } static struct irq_cfg *get_one_free_irq_cfg(int node) @@ -1316,17 +1305,17 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) +static void ioapic_register_intr(unsigned int irq, unsigned long trigger) { if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) - desc->status |= IRQ_LEVEL; + irq_set_status_flags(irq, IRQ_LEVEL); else - desc->status &= ~IRQ_LEVEL; + irq_clear_status_flags(irq, IRQ_LEVEL); if (irq_remapped(irq)) { - desc->status |= IRQ_MOVE_PCNTXT; + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, handle_fasteoi_irq, @@ -1406,18 +1395,14 @@ int setup_ioapic_entry(int apic_id, int irq, return 0; } -static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, - int trigger, int polarity) +static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, + struct irq_cfg *cfg, int trigger, int polarity) { - struct irq_cfg *cfg; struct IO_APIC_route_entry entry; unsigned int dest; if (!IO_APIC_IRQ(irq)) return; - - cfg = get_irq_desc_chip_data(desc); - /* * For legacy irqs, cfg->domain starts with cpu 0 for legacy * controllers like 8259. 
Now that IO-APIC can handle this irq, update @@ -1446,7 +1431,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq return; } - ioapic_register_intr(irq, desc, trigger); + ioapic_register_intr(irq, trigger); if (irq < legacy_pic->nr_legacy_irqs) legacy_pic->mask(irq); @@ -1511,8 +1496,8 @@ static void __init setup_IO_APIC_irqs(void) * don't mark it in pin_programmed, so later acpi could * set it correctly when irq < 16 */ - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), + irq_polarity(idx)); } if (notcon) @@ -1566,7 +1551,7 @@ void setup_IO_APIC_irq_extra(u32 gsi) } set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); - setup_IO_APIC_irq(apic_id, pin, irq, desc, + setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), irq_polarity(idx)); } @@ -2776,9 +2761,9 @@ static struct irq_chip lapic_chip __read_mostly = { .irq_ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq, struct irq_desc *desc) +static void lapic_register_intr(int irq) { - desc->status &= ~IRQ_LEVEL; + irq_clear_status_flags(irq, IRQ_LEVEL); set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -2881,8 +2866,7 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_desc *desc = irq_to_desc(0); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); + struct irq_cfg *cfg = get_irq_chip_data(0); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2952,7 +2936,7 @@ static inline void __init check_timer(void) add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { - /* for edge trigger, setup_IO_APIC_irq already + /* for edge trigger, setup_ioapic_irq already * leave it unmasked. * so only need to unmask if it is level-trigger * do we really have level trigger timer? 
@@ -3020,7 +3004,7 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0, desc); + lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ legacy_pic->unmask(0); @@ -3457,8 +3441,8 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { - int ret; struct msi_msg msg; + int ret; ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) @@ -3468,11 +3452,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) write_msi_msg(irq, &msg); if (irq_remapped(irq)) { - struct irq_desc *desc = irq_to_desc(irq); - /* - * irq migration in process context - */ - desc->status |= IRQ_MOVE_PCNTXT; + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); @@ -3484,13 +3464,10 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - unsigned int irq; - int ret, sub_handle; + int node, ret, sub_handle, index = 0; + unsigned int irq, irq_want; struct msi_desc *msidesc; - unsigned int irq_want; struct intel_iommu *iommu = NULL; - int index = 0; - int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) @@ -3676,7 +3653,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) return ret; hpet_msi_write(get_irq_data(irq), &msg); - irq_set_status_flags(irq,IRQ_MOVE_PCNTXT); + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (irq_remapped(irq)) set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, handle_edge_irq, "edge"); @@ -3862,11 +3839,12 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, trigger = irq_attr->trigger; polarity = irq_attr->polarity; + cfg = get_irq_desc_chip_data(desc); + /* * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= legacy_pic->nr_legacy_irqs) { - cfg = get_irq_desc_chip_data(desc); if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", pin, irq); @@ -3874,7 +3852,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, } } - setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); + setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity); return 0; } @@ -4258,13 +4236,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) void __init pre_init_apic_IRQ0(void) { struct irq_cfg *cfg; - struct irq_desc *desc; printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); #endif - desc = irq_to_desc_alloc_node(0, 0); + irq_to_desc_alloc_node(0, 0); setup_local_APIC(); @@ -4272,5 +4249,5 @@ void __init pre_init_apic_IRQ0(void) add_pin_to_irq_node(cfg, 0, 0, 0); set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); + setup_ioapic_irq(0, 0, 0, cfg, 0, 0); } -- cgit v1.2.3 From f7e909eae444ff733ecc5628af76d89c363ab480 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 21:40:23 +0200 Subject: x86: Prepare the affinity common functions for taking struct irq_data * While at it rename it to sensible function names and fix the return value from unsigned to int for 
__ioapic_set_affinity (set_desc_affinity). Returning -1 in a function returning unsigned int is somewhat strange. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 6 ++-- arch/x86/kernel/apic/io_apic.c | 77 ++++++++++++++++-------------------------- arch/x86/kernel/uv_irq.c | 2 +- 3 files changed, 33 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 46c0fe05f230..76848f27b1ac 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -95,9 +95,9 @@ extern struct irq_cfg *irq_cfg(unsigned int); extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); -struct irq_desc; -extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *, - unsigned int *dest_id); +struct irq_data; +int __ioapic_set_affinity(struct irq_data *, const struct cpumask *, + unsigned int *dest_id); extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 82c3c66e333f..60ca9a47087d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2279,65 +2279,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } /* - * Either sets desc->affinity to a valid value, and returns + * Either sets data->affinity to a valid value, and returns * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves desc->affinity untouched. + * leaves data->affinity untouched. */ -unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, - unsigned int *dest_id) +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) { - struct irq_cfg *cfg; - unsigned int irq; + struct irq_cfg *cfg = data->chip_data; if (!cpumask_intersects(mask, cpu_online_mask)) return -1; - irq = desc->irq; - cfg = get_irq_desc_chip_data(desc); - if (assign_irq_vector(irq, cfg, mask)) + if (assign_irq_vector(data->irq, data->chip_data, mask)) return -1; - cpumask_copy(desc->affinity, mask); + cpumask_copy(data->affinity, mask); - *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); return 0; } static int -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_cfg *cfg; + unsigned int dest, irq = data->irq; unsigned long flags; - unsigned int dest; - unsigned int irq; - int ret = -1; - - irq = desc->irq; - cfg = get_irq_desc_chip_data(desc); + int ret; raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = set_desc_affinity(desc, mask, &dest); + ret = __ioapic_set_affinity(data, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. 
*/ dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); + __target_IO_APIC_irq(irq, dest, data->chip_data); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; } -static int -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - return set_ioapic_affinity_irq_desc(desc, mask); -} - #ifdef CONFIG_INTR_REMAP /* @@ -2668,16 +2649,16 @@ static void ir_ack_apic_level(struct irq_data *data) #endif /* CONFIG_INTR_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .irq_startup = startup_ioapic_irq, - .irq_mask = mask_ioapic_irq, - .irq_unmask = unmask_ioapic_irq, - .irq_ack = ack_apic_edge, - .irq_eoi = ack_apic_level, + .name = "IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = ack_apic_edge, + .irq_eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .irq_set_affinity = ioapic_set_affinity, #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip ir_ioapic_chip __read_mostly = { @@ -3327,7 +3308,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; cfg = get_irq_desc_chip_data(desc); @@ -3359,7 +3340,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) if (get_irte(irq, &irte)) return -1; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3534,7 +3515,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; cfg = get_irq_desc_chip_data(desc); @@ -3590,7 +3571,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, struct msi_msg msg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; hpet_msi_read(data->handler_data, &msg); @@ -3693,7 +3674,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irq_cfg *cfg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; cfg = get_irq_desc_chip_data(desc); @@ -4045,14 +4026,14 @@ void __init setup_ioapic_dest(void) */ if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = desc->irq_data.affinity; else mask = apic->target_cpus(); if (intr_remapping_enabled) set_ir_ioapic_affinity_irq_desc(desc, mask); else - set_ioapic_affinity_irq_desc(desc, mask); + ioapic_set_affinity(&desc->irq_data, mask, false); } } diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 2233a42fb907..56a42e11ae07 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -216,7 +216,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) unsigned long mmr_offset; int mmr_pnode; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; mmr_value = 0; -- cgit v1.2.3 From 5346b2a78fa3b900da672928978475acdd4632dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 
Oct 2010 21:49:03 +0200 Subject: x86: Convert msi affinity setting to new chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 60ca9a47087d..268c5450392b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3301,26 +3301,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, } #ifdef CONFIG_SMP -static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int +msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = get_irq_desc_chip_data(desc); - - __get_cached_msi_msg(desc->irq_data.msi_desc, &msg); + __get_cached_msi_msg(data->msi_desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - __write_msi_msg(desc->irq_data.msi_desc, &msg); + __write_msi_msg(data->msi_desc, &msg); return 0; } @@ -3370,14 +3368,14 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) * which implement the MSI or MSI-X Capability Structure. */ static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, - .irq_ack = ack_apic_edge, + .name = "PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, + .irq_set_affinity = msi_set_affinity, #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip msi_ir_chip = { -- cgit v1.2.3 From f19f5ecc920215decfea54f26e3eb14064506675 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 21:50:22 +0200 Subject: x86: Convert remapped ioapic affinity setting to new irq chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Suresh Siddha --- arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 268c5450392b..49cc27d5658d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2333,24 +2333,21 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, * the interrupt-remapping table entry. 
*/ static int -migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct irte irte; - unsigned int dest; - unsigned int irq; - int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return ret; + return -EINVAL; - irq = desc->irq; if (get_irte(irq, &irte)) - return ret; + return -EBUSY; - cfg = get_irq_desc_chip_data(desc); if (assign_irq_vector(irq, cfg, mask)) - return ret; + return -EBUSY; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2365,29 +2362,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (cfg->move_in_progress) send_cleanup_vector(cfg); - cpumask_copy(desc->affinity, mask); - + cpumask_copy(data->affinity, mask); return 0; } -/* - * Migrates the IRQ destination in the process context. - */ -static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, - const struct cpumask *mask) -{ - return migrate_ioapic_irq_desc(desc, mask); -} -static int set_ir_ioapic_affinity_irq(unsigned int irq, - const struct cpumask *mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return set_ir_ioapic_affinity_irq_desc(desc, mask); -} #else -static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, - const struct cpumask *mask) +static inline int +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { return 0; } @@ -2662,18 +2644,18 @@ static struct irq_chip ioapic_chip __read_mostly = { }; static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .irq_startup = startup_ioapic_irq, - .irq_mask = mask_ioapic_irq, - .irq_unmask = unmask_ioapic_irq, + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, - .irq_eoi = ir_ack_apic_level, + .irq_ack = ir_ack_apic_edge, + .irq_eoi = ir_ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, + .irq_set_affinity = ir_ioapic_set_affinity, #endif #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static inline void init_IO_APIC_traps(void) @@ -4029,7 +4011,7 @@ void __init setup_ioapic_dest(void) mask = apic->target_cpus(); if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); + ir_ioapic_set_affinity(&desc->irq_data, mask, false); else ioapic_set_affinity(&desc->irq_data, mask, false); } -- cgit v1.2.3 From b5d1c465794f521c352d9c1a33159750c9c3fa84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 22:15:49 +0200 Subject: x86: Convert remapped msi to new chip.irq_set_affinity function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Suresh Siddha Cc: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 49cc27d5658d..13f8e28ba4d8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3310,17 +3310,17 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) * done in the process context using interrupt-remapping hardware. 
*/ static int -ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); - unsigned int dest; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct irte irte; if (get_irte(irq, &irte)) return -1; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3361,16 +3361,16 @@ static struct irq_chip msi_chip = { }; static struct irq_chip msi_ir_chip = { - .name = "IR-PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, + .name = "IR-PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, #ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, + .irq_set_affinity = ir_msi_set_affinity, #endif #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; /* @@ -3569,16 +3569,16 @@ static int hpet_msi_set_affinity(struct irq_data *data, #endif /* CONFIG_SMP */ static struct irq_chip ir_hpet_msi_type = { - .name = "IR-HPET_MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, + .name = "IR-HPET_MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, #ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, + .irq_set_affinity = ir_msi_set_affinity, #endif #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip hpet_msi_type = { -- cgit v1.2.3 From fe52b2d25919eaa01c51651a664f4f2ba6bd2a11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 22:19:29 +0200 Subject: x86: Convert dmar affinity setting to new chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: David Woodhouse --- arch/x86/kernel/apic/io_apic.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 13f8e28ba4d8..6f8ac4c542ba 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3488,18 +3488,17 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int +dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct msi_msg msg; - unsigned int dest; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = get_irq_desc_chip_data(desc); - dmar_msi_read(irq, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; @@ -3515,14 +3514,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ static struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .irq_unmask = dmar_msi_unmask, - .irq_mask = dmar_msi_mask, - .irq_ack = ack_apic_edge, + .name = "DMAR_MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - 
.set_affinity = dmar_msi_set_affinity, + .irq_set_affinity = dmar_msi_set_affinity, #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_dmar_msi(unsigned int irq) -- cgit v1.2.3 From 0e09ddf2d71aeff92ff8055ac7600b85c255ee85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 22:21:26 +0200 Subject: x86: Cleanup hpet affinity setting Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6f8ac4c542ba..0b9ec3cb311f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3545,12 +3545,11 @@ int arch_setup_dmar_msi(unsigned int irq) static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(data->irq); struct irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; hpet_msi_read(data->handler_data, &msg); -- cgit v1.2.3 From be5b7bf73802a9391158d9fcc0bc6b07670c73a5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 22:31:46 +0200 Subject: x86: Convert ht set_affinity to new chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0b9ec3cb311f..b144f7a95970 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3646,33 +3646,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int +ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; unsigned int dest; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = get_irq_desc_chip_data(desc); - - target_ht_irq(irq, dest, cfg->vector); - + target_ht_irq(data->irq, dest, cfg->vector); return 0; } #endif static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .irq_mask = mask_ht_irq, - .irq_unmask = unmask_ht_irq, - .irq_ack = ack_apic_edge, + .name = "PCI-HT", + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, + .irq_set_affinity = ht_set_affinity, #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -- cgit v1.2.3 From 7e495529b62cf462eb2d8875fe15ca446b8e1f94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 23:31:50 +0200 Subject: x86: ioapic: Cleanup some more Cleanup after the irq_chip conversion a bit. 
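The renames below leave behavior unchanged; sketched from the hunks that follow, the helpers end up as:

        /* was get_one_free_irq_2_pin() */
        static struct irq_pin_list *alloc_irq_pin_list(int node)
        {
                return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
        }

        /* was add_pin_to_irq_node_nopanic(); the panicking wrapper keeps its name */
        if (__add_pin_to_irq_node(cfg, node, apic, pin))
                panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
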
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b144f7a95970..43030995dcce 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -131,13 +131,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int node) +static struct irq_pin_list *alloc_irq_pin_list(int node) { - struct irq_pin_list *pin; - - pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); - - return pin; + return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); } /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -232,7 +228,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) if (!old_entry) return; - entry = get_one_free_irq_2_pin(node); + entry = alloc_irq_pin_list(node); if (!entry) return; @@ -242,7 +238,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(node); + entry = alloc_irq_pin_list(node); if (!entry) { entry = head; while (entry) { @@ -471,7 +467,7 @@ static void ioapic_mask_entry(int apic, int pin) * fast in the common case, and fast for shared ISA-space IRQs. */ static int -add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) +__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -483,7 +479,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) last = &entry->next; } - entry = get_one_free_irq_2_pin(node); + entry = alloc_irq_pin_list(node); if (!entry) { printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -498,7 +494,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { - if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(cfg, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. 
Can not proceed\n"); } @@ -3801,7 +3797,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= legacy_pic->nr_legacy_irqs) { - if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", pin, irq); return 0; -- cgit v1.2.3 From 6e2fff50a5bd72a3f9e6f3ef6e9137efddb2d580 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Oct 2010 22:07:03 +0200 Subject: x86: ioapic: Cleanup get_one_free_irq_cfg() Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 43030995dcce..452f781a042e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -185,19 +185,18 @@ static struct irq_cfg *get_one_free_irq_cfg(int node) struct irq_cfg *cfg; cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); - if (cfg) { - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { - kfree(cfg); - cfg = NULL; - } else if (!zalloc_cpumask_var_node(&cfg->old_domain, - GFP_ATOMIC, node)) { - free_cpumask_var(cfg->domain); - kfree(cfg); - cfg = NULL; - } - } - + if (!cfg) + return NULL; + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) + goto out_cfg; + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) + goto out_domain; return cfg; +out_domain: + free_cpumask_var(cfg->domain); +out_cfg: + kfree(cfg); + return NULL; } int arch_init_chip_data(struct irq_desc *desc, int node) -- cgit v1.2.3 From 08c33db6d044d9dc74ddf8d9ee3cb1fa3eca262b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Oct 2010 22:14:21 +0200 Subject: x86: Implement new allocator functions Implement new allocator functions which make use of the core changes. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 54 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 452f781a042e..065c5dc88b8c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -199,6 +199,13 @@ out_cfg: return NULL; } +static void free_irq_cfg(struct irq_cfg *cfg) +{ + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); +} + int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; @@ -299,13 +306,6 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, init_copy_irq_2_pin(old_cfg, cfg, node); } -static void free_irq_cfg(struct irq_cfg *cfg) -{ - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); -} - void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) { struct irq_cfg *old_cfg, *cfg; @@ -325,13 +325,53 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) /* end for move_irq_desc */ #else + struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? 
irq_cfgx + irq : NULL; } +static struct irq_cfg *get_one_free_irq_cfg(unsigned int irq, int node) +{ + return irq_cfgx + irq; +} + +static inline void free_irq_cfg(struct irq_cfg *cfg) { } + #endif +static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +{ + int res = irq_alloc_desc_at(at, node); + struct irq_cfg *cfg; + + if (res < 0) { + if (res != -EEXIST) + return NULL; + cfg = get_irq_chip_data(at); + if (cfg) + return cfg; + } + + cfg = get_one_free_irq_cfg(node); + if (cfg) + set_irq_chip_data(at, cfg); + else + irq_free_desc(at); + return cfg; +} + +static int alloc_irq_from(unsigned int from, int node) +{ + return irq_alloc_desc_from(from, node); +} + +static void free_irq_at(unsigned int at, struct irq_cfg *cfg) +{ + free_irq_cfg(cfg); + irq_free_desc(at); +} + struct io_apic { unsigned int index; unsigned int unused[3]; -- cgit v1.2.3 From f981a3dc1941035a108da1276c448de6b10ddac9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 10:44:21 +0200 Subject: x86: io_apic: Prepare alloc/free_irq_cfg() Rename the grossly misnamed get_one_free_irq_cfg() to alloc_irq_cfg(). Add a (not yet used) irq number argument to free_irq_cfg() Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 065c5dc88b8c..06da8fe2647e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -180,7 +180,7 @@ struct irq_cfg *irq_cfg(unsigned int irq) return get_irq_chip_data(irq); } -static struct irq_cfg *get_one_free_irq_cfg(int node) +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) { struct irq_cfg *cfg; @@ -199,7 +199,7 @@ out_cfg: return NULL; } -static void free_irq_cfg(struct irq_cfg *cfg) +static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { free_cpumask_var(cfg->domain); free_cpumask_var(cfg->old_domain); @@ -212,7 +212,7 @@ int arch_init_chip_data(struct irq_desc *desc, int node) cfg = get_irq_desc_chip_data(desc); if (!cfg) { - cfg = get_one_free_irq_cfg(node); + cfg = alloc_irq_cfg(desc->irq, node); desc->chip_data = cfg; if (!cfg) { printk(KERN_ERR "can not alloc irq_cfg\n"); @@ -289,7 +289,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(node); + cfg = alloc_irq_cfg(desc->irq, node); if (!cfg) return; @@ -318,7 +318,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) if (old_cfg) { free_irq_2_pin(old_cfg, cfg); - free_irq_cfg(old_cfg); + free_irq_cfg(old_desc->irq, old_cfg); old_desc->chip_data = NULL; } } @@ -331,12 +331,12 @@ struct irq_cfg *irq_cfg(unsigned int irq) return irq < nr_irqs ? 
irq_cfgx + irq : NULL; } -static struct irq_cfg *get_one_free_irq_cfg(unsigned int irq, int node) +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) { return irq_cfgx + irq; } -static inline void free_irq_cfg(struct irq_cfg *cfg) { } +static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { } #endif @@ -353,7 +353,7 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } - cfg = get_one_free_irq_cfg(node); + cfg = alloc_irq_cfg(at, node); if (cfg) set_irq_chip_data(at, cfg); else @@ -368,7 +368,7 @@ static int alloc_irq_from(unsigned int from, int node) static void free_irq_at(unsigned int at, struct irq_cfg *cfg) { - free_irq_cfg(cfg); + free_irq_cfg(at, cfg); irq_free_desc(at); } -- cgit v1.2.3 From fe6dab4e79e82ec35879bfe1a8968b7d15ac0d91 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 8 Oct 2010 22:44:02 -0700 Subject: x86: Don't setup ioapic irq for sci twice The sparseirq rework triggered a warning in the iommu code, which was caused by setting up ioapic for ACPI irq 9 twice. This function is solely to handle interrupts which are on a secondary ioapic and outside the legacy irq range. Replace the sparse irq_to_desc check with a non ifdeffed version. [ tglx: Moved it before the ioapic sparse conversion and simplified the inverse logic ] Signed-off-by: Yinghai Lu LKML-Reference: <4CB00122.3030301@kernel.org> Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 06da8fe2647e..5aae718a7133 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1565,11 +1565,11 @@ void setup_IO_APIC_irq_extra(u32 gsi) return; irq = pin_2_irq(idx, apic_id, pin); -#ifdef CONFIG_SPARSE_IRQ - desc = irq_to_desc(irq); - if (desc) + + /* Only handle the non legacy irqs on secondary ioapics */ + if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; -#endif + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); -- cgit v1.2.3 From fbc6bff04a095e049be290ff6f6ac68839166bd6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 20:34:53 +0200 Subject: x86: ioapic: Cleanup sparse irq code Switch over to the new allocator and remove all the magic which was caused by the unability to destroy irq descriptors. Get rid of the create_irq_nr() loop for sparse and non sparse irq. 
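Condensed from the hunks that follow, the loop-free allocation path in create_irq_nr() becomes (error handling trimmed for brevity):

        irq = alloc_irq_from(from, node);       /* grab a free descriptor >= from */
        cfg = alloc_irq_cfg(irq, node);         /* per-irq vector/domain data */

        raw_spin_lock_irqsave(&vector_lock, flags);
        if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
                ret = irq;
        raw_spin_unlock_irqrestore(&vector_lock, flags);

        if (ret) {
                set_irq_chip_data(irq, cfg);
                irq_clear_status_flags(irq, IRQ_NOREQUEST);
        } else {
                free_irq_at(irq, cfg);
        }
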
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/Kconfig | 1 - arch/x86/kernel/apic/io_apic.c | 107 ++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3ec657f7ee70..8cc510874e1b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,7 +61,6 @@ config X86 select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ - select NUMA_IRQ_DESC if (SPARSE_IRQ && NUMA) select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5aae718a7133..ed340297571e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -157,6 +157,9 @@ int __init arch_early_irq_init(void) count = ARRAY_SIZE(irq_cfgx); node = cpu_to_node(0); + /* Make sure the legacy interrupts are marked in the bitmap */ + irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); + for (i = 0; i < count; i++) { set_irq_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); @@ -201,11 +204,15 @@ out_cfg: static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { + if (!cfg) + return; + set_irq_chip_data(at, NULL); free_cpumask_var(cfg->domain); free_cpumask_var(cfg->old_domain); kfree(cfg); } +#if 0 int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; @@ -323,6 +330,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) } } /* end for move_irq_desc */ +#endif #else @@ -1479,11 +1487,9 @@ static struct { static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; - int notcon = 0; - struct irq_desc *desc; - struct irq_cfg *cfg; + int apic_id, pin, idx, irq, notcon = 0; int node = cpu_to_node(0); + struct irq_cfg *cfg; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1520,12 +1526,10 @@ static void __init setup_IO_APIC_irqs(void) apic->multi_timer_check(apic_id, irq)) continue; - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) continue; - } - cfg = get_irq_desc_chip_data(desc); + add_pin_to_irq_node(cfg, node, apic_id, pin); /* * don't mark it in pin_programmed, so later acpi could @@ -1547,9 +1551,7 @@ static void __init setup_IO_APIC_irqs(void) */ void setup_IO_APIC_irq_extra(u32 gsi) { - int apic_id = 0, pin, idx, irq; - int node = cpu_to_node(0); - struct irq_desc *desc; + int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); struct irq_cfg *cfg; /* @@ -1570,13 +1572,10 @@ void setup_IO_APIC_irq_extra(u32 gsi) if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) return; - } - cfg = get_irq_desc_chip_data(desc); add_pin_to_irq_node(cfg, node, apic_id, pin); if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { @@ -3177,44 +3176,37 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want, int node) +unsigned int create_irq_nr(unsigned int from, int node) { - /* Allocate an unused irq */ - unsigned int irq; - unsigned int new; + struct irq_cfg *cfg; unsigned long flags; - struct irq_cfg *cfg_new = NULL; - struct irq_desc *desc_new = NULL; - - irq = 0; - if (irq_want < nr_irqs_gsi) - irq_want = 
nr_irqs_gsi; - - raw_spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_node(new, node); - if (!desc_new) { - printk(KERN_INFO "can not get irq_desc for %d\n", new); - continue; - } - cfg_new = get_irq_desc_chip_data(desc_new); - - if (cfg_new->vector != 0) - continue; + unsigned int ret = 0; + int irq; - desc_new = move_irq_desc(desc_new, node); - cfg_new = get_irq_desc_chip_data(desc_new); + if (from < nr_irqs_gsi) + from = nr_irqs_gsi; - if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) - irq = new; - break; + irq = alloc_irq_from(from, node); + if (irq < 0) + return 0; + cfg = alloc_irq_cfg(irq, node); + if (!cfg) { + free_irq_at(irq, NULL); + return 0; } - raw_spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) - dynamic_irq_init_keep_chip_data(irq); + raw_spin_lock_irqsave(&vector_lock, flags); + if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) + ret = irq; + raw_spin_unlock_irqrestore(&vector_lock, flags); - return irq; + if (ret) { + set_irq_chip_data(irq, cfg); + irq_clear_status_flags(irq, IRQ_NOREQUEST); + } else { + free_irq_at(irq, cfg); + } + return ret; } int create_irq(void) @@ -3234,14 +3226,16 @@ int create_irq(void) void destroy_irq(unsigned int irq) { + struct irq_cfg *cfg = get_irq_chip_data(irq); unsigned long flags; - dynamic_irq_cleanup_keep_chip_data(irq); + irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, get_irq_chip_data(irq)); + __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); + free_irq_at(irq, cfg); } /* @@ -3802,7 +3796,6 @@ int __init arch_probe_nr_irqs(void) static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { - struct irq_desc *desc; struct irq_cfg *cfg; int node; int ioapic, pin; @@ -3820,18 +3813,14 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, else node = cpu_to_node(0); - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) return 0; - } pin = irq_attr->ioapic_pin; trigger = irq_attr->trigger; polarity = irq_attr->polarity; - cfg = get_irq_desc_chip_data(desc); - /* * IRQs < 16 are already in the irq_2_pin[] map */ @@ -4232,11 +4221,11 @@ void __init pre_init_apic_IRQ0(void) #ifndef CONFIG_SMP phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); #endif - irq_to_desc_alloc_node(0, 0); + /* Make sure the irq descriptor is set up */ + cfg = alloc_irq_and_cfg_at(0, 0); setup_local_APIC(); - cfg = irq_cfg(0); add_pin_to_irq_node(cfg, 0, 0, 0); set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); -- cgit v1.2.3 From bc5fdf9f3aad37406b3c8d635a7940cd65de0c12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 10:40:53 +0200 Subject: x86: io_apic: Remove the now unused sparse_irq arch_* functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 120 ----------------------------------------- 1 file changed, 120 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ed340297571e..6ff6bb883c58 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -212,126 +212,6 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) kfree(cfg); } -#if 0 -int 
arch_init_chip_data(struct irq_desc *desc, int node) -{ - struct irq_cfg *cfg; - - cfg = get_irq_desc_chip_data(desc); - if (!cfg) { - cfg = alloc_irq_cfg(desc->irq, node); - desc->chip_data = cfg; - if (!cfg) { - printk(KERN_ERR "can not alloc irq_cfg\n"); - BUG_ON(1); - } - } - - return 0; -} - -/* for move_irq_desc */ -static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) -{ - struct irq_pin_list *old_entry, *head, *tail, *entry; - - cfg->irq_2_pin = NULL; - old_entry = old_cfg->irq_2_pin; - if (!old_entry) - return; - - entry = alloc_irq_pin_list(node); - if (!entry) - return; - - entry->apic = old_entry->apic; - entry->pin = old_entry->pin; - head = entry; - tail = entry; - old_entry = old_entry->next; - while (old_entry) { - entry = alloc_irq_pin_list(node); - if (!entry) { - entry = head; - while (entry) { - head = entry->next; - kfree(entry); - entry = head; - } - /* still use the old one */ - return; - } - entry->apic = old_entry->apic; - entry->pin = old_entry->pin; - tail->next = entry; - tail = entry; - old_entry = old_entry->next; - } - - tail->next = NULL; - cfg->irq_2_pin = head; -} - -static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry, *next; - - if (old_cfg->irq_2_pin == cfg->irq_2_pin) - return; - - entry = old_cfg->irq_2_pin; - - while (entry) { - next = entry->next; - kfree(entry); - entry = next; - } - old_cfg->irq_2_pin = NULL; -} - -void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int node) -{ - struct irq_cfg *cfg; - struct irq_cfg *old_cfg; - - cfg = alloc_irq_cfg(desc->irq, node); - - if (!cfg) - return; - - desc->chip_data = cfg; - - old_cfg = old_desc->chip_data; - - cfg->vector = old_cfg->vector; - cfg->move_in_progress = old_cfg->move_in_progress; - cpumask_copy(cfg->domain, old_cfg->domain); - cpumask_copy(cfg->old_domain, old_cfg->old_domain); - - init_copy_irq_2_pin(old_cfg, cfg, node); -} - -void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) -{ - struct irq_cfg *old_cfg, *cfg; - - old_cfg = get_irq_desc_chip_data(old_desc); - cfg = get_irq_desc_chip_data(desc); - - if (old_cfg == cfg) - return; - - if (old_cfg) { - free_irq_2_pin(old_cfg, cfg); - free_irq_cfg(old_desc->irq, old_cfg); - old_desc->chip_data = NULL; - } -} -/* end for move_irq_desc */ -#endif - #else struct irq_cfg *irq_cfg(unsigned int irq) -- cgit v1.2.3 From 423f085952fd7253407cb92984cc2d495a564481 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 10 Oct 2010 11:39:09 +0200 Subject: x86: Embedd irq_2_iommu into irq_cfg That interrupt remapping code is x86 specific and tied to the io_apic code. No need for separate allocator functions in the interrupt remapping code. This allows to simplify the code and irq_2_iommu is small (13 bytes on 64bit) so it's not a real problem even if interrupt remapping is runtime disabled. If it's compile time disabled the impact is zero. 
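Sketched from the hunks that follow: the remapping state moves into hw_irq.h and is embedded in struct irq_cfg under CONFIG_INTR_REMAP, which lets the follow-up patch turn the hot-path check into a plain pointer test:

        /* added to hw_irq.h and embedded in struct irq_cfg */
        struct irq_2_iommu {
                struct intel_iommu *iommu;
                u16 irte_index;
                u16 sub_handle;
                u8 irte_mask;
        };

        /* follow-up patch: no per-irq lookup needed, e.g. in eoi_ioapic_irq() */
        static inline bool irq_remapped(struct irq_cfg *cfg)
        {
                return cfg->irq_2_iommu.iommu != NULL;
        }
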
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes --- arch/x86/include/asm/hw_irq.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 76848f27b1ac..e756c4bfed94 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -78,6 +78,13 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, irq_attr->polarity = polarity; } +struct irq_2_iommu { + struct intel_iommu *iommu; + u16 irte_index; + u16 sub_handle; + u8 irte_mask; +}; + /* * This is performance-critical, we want to do it O(1) * @@ -89,6 +96,9 @@ struct irq_cfg { cpumask_var_t old_domain; u8 vector; u8 move_in_progress : 1; +#ifdef CONFIG_INTR_REMAP + struct irq_2_iommu irq_2_iommu; +#endif }; extern struct irq_cfg *irq_cfg(unsigned int); -- cgit v1.2.3 From 1a0730d6649113c820217387a011a17dd4aff3ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Oct 2010 11:55:37 +0200 Subject: x86: Speed up the irq_remapped check in hot pathes irq_2_iommu is in struct irq_cfg, so we can do the irq_remapped check based on irq_cfg instead of going through a lookup function. That's especially interesting in the eoi_ioapic_irq() hotpath. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes --- arch/x86/include/asm/irq_remapping.h | 8 ++++++++ arch/x86/kernel/apic/io_apic.c | 12 ++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 8d841505344e..1c23360fb2d8 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -24,10 +24,18 @@ static inline void prepare_irte(struct irte *irte, int vector, irte->dest_id = IRTE_DEST(dest); irte->redir_hint = 1; } +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return cfg->irq_2_iommu.iommu != NULL; +} #else static void prepare_irte(struct irte *irte, int vector, unsigned int dest) { } +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return false; +} #endif #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6ff6bb883c58..1b8e8a106120 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1237,7 +1237,7 @@ static void ioapic_register_intr(unsigned int irq, unsigned long trigger) else irq_clear_status_flags(irq, IRQ_LEVEL); - if (irq_remapped(irq)) { + if (irq_remapped(get_irq_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, @@ -2183,7 +2183,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ - if (!irq_remapped(irq)) + if (!irq_remapped(cfg)) io_apic_write(apic, 0x11 + pin*2, dest); reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; @@ -2415,7 +2415,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) * intr-remapping table entry. Hence for the io-apic * EOI we use the pin number. 
*/ - if (irq_remapped(irq)) + if (irq_remapped(cfg)) io_apic_eoi(entry->apic, entry->pin); else io_apic_eoi(entry->apic, cfg->vector); @@ -3139,7 +3139,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); - if (irq_remapped(irq)) { + if (irq_remapped(get_irq_chip_data(irq))) { struct irte irte; int ir_index; u16 sub_handle; @@ -3321,7 +3321,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); - if (irq_remapped(irq)) { + if (irq_remapped(get_irq_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else @@ -3522,7 +3522,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) hpet_msi_write(get_irq_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (irq_remapped(irq)) + if (irq_remapped(get_irq_chip_data(irq))) set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, handle_edge_irq, "edge"); else -- cgit v1.2.3 From 1a8ce7ff68d777195da2d340561bda610e533a64 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Oct 2010 21:08:56 +0200 Subject: x86: Make io_apic.c local functions static No users outside of io_apic.c Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 6 ------ arch/x86/kernel/apic/io_apic.c | 10 +++++----- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9cb2edb87c2f..c8be4566c3d2 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -170,12 +170,6 @@ extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void probe_nr_irqs_gsi(void); -extern int setup_ioapic_entry(int apic, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector, int pin); -extern void ioapic_write_entry(int apic, int pin, - struct IO_APIC_route_entry e); extern void setup_ioapic_ids_from_mpc(void); struct mp_ioapic_gsi{ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1b8e8a106120..0102543b5647 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -364,7 +364,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) io_apic_write(apic, 0x10 + 2*pin, eu.w1); } -void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -1259,10 +1259,10 @@ static void ioapic_register_intr(unsigned int irq, unsigned long trigger) handle_edge_irq, "edge"); } -int setup_ioapic_entry(int apic_id, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector, int pin) +static int setup_ioapic_entry(int apic_id, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector, int pin) { /* * add it to the IO-APIC irq-routing table: -- cgit v1.2.3 From 48b2650196364e4ef124efb841b63c2326e4ccb2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 11:43:08 +0200 Subject: x86: uv: Clean up the direct access to irq_desc Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 1 - 
arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/uv_irq.c | 55 +++++++++++++++--------------------------- 3 files changed, 20 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e756c4bfed94..d5905fd8ba41 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -101,7 +101,6 @@ struct irq_cfg { #endif }; -extern struct irq_cfg *irq_cfg(unsigned int); extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0102543b5647..5193f201b916 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -178,7 +178,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -struct irq_cfg *irq_cfg(unsigned int irq) +static struct irq_cfg *irq_cfg(unsigned int irq) { return get_irq_chip_data(irq); } diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 56a42e11ae07..7b24460917d5 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -28,34 +28,21 @@ struct uv_irq_2_mmr_pnode{ static spinlock_t uv_irq_lock; static struct rb_root uv_irq_root; -static int uv_set_irq_affinity(unsigned int, const struct cpumask *); +static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); -static void uv_noop(unsigned int irq) -{ -} - -static unsigned int uv_noop_ret(unsigned int irq) -{ - return 0; -} +static void uv_noop(struct irq_data *data) { } -static void uv_ack_apic(unsigned int irq) +static void uv_ack_apic(struct irq_data *data) { ack_APIC_irq(); } static struct irq_chip uv_irq_chip = { - .name = "UV-CORE", - .startup = uv_noop_ret, - .shutdown = uv_noop, - .enable = uv_noop, - .disable = uv_noop, - .ack = uv_noop, - .mask = uv_noop, - .unmask = uv_noop, - .eoi = uv_ack_apic, - .end = uv_noop, - .set_affinity = uv_set_irq_affinity, + .name = "UV-CORE", + .irq_mask = uv_noop, + .irq_unmask = uv_noop, + .irq_eoi = uv_ack_apic, + .irq_set_affinity = uv_set_irq_affinity, }; /* @@ -144,26 +131,22 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; - int mmr_pnode; + struct irq_cfg *cfg = get_irq_chip_data(irq); unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - int err; + int mmr_pnode, err; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; if (limit == UV_AFFINITY_CPU) - desc->status |= IRQ_NO_BALANCING; + irq_set_status_flags(irq, IRQ_NO_BALANCING); else - desc->status |= IRQ_MOVE_PCNTXT; + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, irq_name); @@ -206,17 +189,17 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); } -static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int +uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); + struct irq_cfg *cfg = data->chip_data; unsigned int dest; - unsigned long mmr_value; + unsigned long 
mmr_value, mmr_offset; struct uv_IO_APIC_route_entry *entry; - unsigned long mmr_offset; int mmr_pnode; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; mmr_value = 0; @@ -231,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) entry->dest = dest; /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) return -1; uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -- cgit v1.2.3 From ad9f43340f48c5f7a0a5ef7656986e23d06bf996 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 11:26:43 +0200 Subject: x86: Use sane enumeration Instead of looping through all interrupts, use the bitmap lookup to find the next. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5193f201b916..057b0e13d1c3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1160,7 +1160,6 @@ void __setup_vector_irq(int cpu) /* Initialize vector_irq on a new cpu */ int irq, vector; struct irq_cfg *cfg; - struct irq_desc *desc; /* * vector_lock will make sure that we don't run into irq vector @@ -1169,9 +1168,10 @@ void __setup_vector_irq(int cpu) */ raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ - for_each_irq_desc(irq, desc) { - cfg = get_irq_desc_chip_data(desc); - + for_each_active_irq(irq) { + cfg = get_irq_chip_data(irq); + if (!cfg) + continue; /* * If it is a legacy IRQ handled by the legacy PIC, this cpu * will be part of the irq_cfg's domain. @@ -1516,7 +1516,6 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; - struct irq_desc *desc; unsigned int irq; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); @@ -1603,10 +1602,10 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_desc(irq, desc) { + for_each_active_irq(irq) { struct irq_pin_list *entry; - cfg = get_irq_desc_chip_data(desc); + cfg = get_irq_chip_data(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -2574,9 +2573,8 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { - int irq; - struct irq_desc *desc; struct irq_cfg *cfg; + unsigned int irq; /* * NOTE! The local APIC isn't very good at handling @@ -2589,8 +2587,8 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_desc(irq, desc) { - cfg = get_irq_desc_chip_data(desc); + for_each_active_irq(irq) { + cfg = get_irq_chip_data(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, @@ -2601,7 +2599,7 @@ static inline void init_IO_APIC_traps(void) legacy_pic->make_irq(irq); else /* Strange. Oh, well.. 
*/ - desc->chip = &no_irq_chip; + set_irq_chip(irq, &no_irq_chip); } } } -- cgit v1.2.3 From c2f31c37b7303150ffcce53f6c6ed4ac62952b34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 12:19:03 +0200 Subject: x86: lguest: Use new irq allocator Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Rusty Russell --- arch/x86/lguest/boot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 2d4e6fcac836..73b1e1a1f489 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -838,12 +838,12 @@ static void __init lguest_init_IRQ(void) * rather than set them in lguest_init_IRQ we are called here every time an * lguest device needs an interrupt. * - * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should + * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should * pass that up! */ void lguest_setup_irq(unsigned int irq) { - irq_to_desc_alloc_node(irq, 0); + irq_alloc_desc_at(irq, 0); set_irq_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } -- cgit v1.2.3 From 2ee39065988d632b403f8470942b0b5edee3632b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Oct 2010 16:28:51 +0200 Subject: x86: Switch sparse_irq allocations to GFP_KERNEL No callers from atomic context (except boot) anymore. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 057b0e13d1c3..20e47e045395 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -133,7 +133,7 @@ struct irq_pin_list { static struct irq_pin_list *alloc_irq_pin_list(int node) { - return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); + return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); } /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -162,8 +162,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { set_irq_chip_data(i, &cfg[i]); - zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); - zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. 
@@ -187,12 +187,12 @@ static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) { struct irq_cfg *cfg; - cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); if (!cfg) return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) goto out_domain; return cfg; out_domain: @@ -595,14 +595,14 @@ struct IO_APIC_route_entry **alloc_ioapic_entries(void) struct IO_APIC_route_entry **ioapic_entries; ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, - GFP_ATOMIC); + GFP_KERNEL); if (!ioapic_entries) return 0; for (apic = 0; apic < nr_ioapics; apic++) { ioapic_entries[apic] = kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_ATOMIC); + nr_ioapic_registers[apic], GFP_KERNEL); if (!ioapic_entries[apic]) goto nomem; } -- cgit v1.2.3 From bf1ebf007911d70a89de9a4a1b36d403e8eb064b Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sun, 10 Oct 2010 10:40:32 +0100 Subject: x86, olpc: Add XO-1 poweroff support Add a pm_power_off handler for the OLPC XO-1 laptop. The driver can be built modular and follows the behaviour of the APM driver, setting pm_power_off to NULL on unload. However, the ability to unload the module will probably be removed (with a simple __module_get(THIS_MODULE)) if/when XO-1 suspend/resume support is added to this file at a later date. Signed-off-by: Daniel Drake LKML-Reference: <20101010094032.9AE669D401B@zog.reactivated.net> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 6 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/olpc-xo1.c | 140 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+) create mode 100644 arch/x86/kernel/olpc-xo1.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c25525571347..0bcb8765b1c0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2066,6 +2066,12 @@ config OLPC Add support for detecting the unique features of the OLPC XO hardware. +config OLPC_XO1 + tristate "OLPC XO-1 support" + depends on OLPC + ---help--- + Add support for non-essential features of the OLPC XO-1 laptop. + config OLPC_OPENFIRMWARE bool "Support for OLPC's Open Firmware" depends on !X86_64 && !X86_PAE diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266bd..e9777791e39c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c new file mode 100644 index 000000000000..f5442c03abc3 --- /dev/null +++ b/arch/x86/kernel/olpc-xo1.c @@ -0,0 +1,140 @@ +/* + * Support for features of the OLPC XO-1 laptop + * + * Copyright (C) 2010 One Laptop per Child + * Copyright (C) 2006 Red Hat, Inc. + * Copyright (C) 2006 Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +#define DRV_NAME "olpc-xo1" + +#define PMS_BAR 4 +#define ACPI_BAR 5 + +/* PMC registers (PMS block) */ +#define PM_SCLK 0x10 +#define PM_IN_SLPCTL 0x20 +#define PM_WKXD 0x34 +#define PM_WKD 0x30 +#define PM_SSC 0x54 + +/* PM registers (ACPI block) */ +#define PM1_CNT 0x08 +#define PM_GPE0_STS 0x18 + +static unsigned long acpi_base; +static unsigned long pms_base; + +static void xo1_power_off(void) +{ + printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); + + /* Enable all of these controls with 0 delay */ + outl(0x40000000, pms_base + PM_SCLK); + outl(0x40000000, pms_base + PM_IN_SLPCTL); + outl(0x40000000, pms_base + PM_WKXD); + outl(0x40000000, pms_base + PM_WKD); + + /* Clear status bits (possibly unnecessary) */ + outl(0x0002ffff, pms_base + PM_SSC); + outl(0xffffffff, acpi_base + PM_GPE0_STS); + + /* Write SLP_EN bit to start the machinery */ + outl(0x00002000, acpi_base + PM1_CNT); +} + +/* Read the base addresses from the PCI BAR info */ +static int __devinit setup_bases(struct pci_dev *pdev) +{ + int r; + + r = pci_enable_device_io(pdev); + if (r) { + dev_err(&pdev->dev, "can't enable device IO\n"); + return r; + } + + r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); + if (r) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); + return r; + } + + r = pci_request_region(pdev, PMS_BAR, DRV_NAME); + if (r) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); + pci_release_region(pdev, ACPI_BAR); + return r; + } + + acpi_base = pci_resource_start(pdev, ACPI_BAR); + pms_base = pci_resource_start(pdev, PMS_BAR); + + return 0; +} + +static int __devinit olpc_xo1_probe(struct platform_device *pdev) +{ + struct pci_dev *pcidev; + int r; + + pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, + NULL); + if (!pdev) + return -ENODEV; + + r = setup_bases(pcidev); + if (r) + return r; + + pm_power_off = xo1_power_off; + + printk(KERN_INFO "OLPC XO-1 support registered\n"); + return 0; +} + +static int __devexit olpc_xo1_remove(struct platform_device *pdev) +{ + pm_power_off = NULL; + return 0; +} + +static struct platform_driver olpc_xo1_driver = { + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, + .probe = olpc_xo1_probe, + .remove = __devexit_p(olpc_xo1_remove), +}; + +static int __init olpc_xo1_init(void) +{ + return platform_driver_register(&olpc_xo1_driver); +} + +static void __exit olpc_xo1_exit(void) +{ + platform_driver_unregister(&olpc_xo1_driver); +} + +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:olpc-xo1"); + +module_init(olpc_xo1_init); +module_exit(olpc_xo1_exit); -- cgit v1.2.3 From 5bcd757f93cc713cf61bbeefceda7539d9afca55 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 4 Oct 2010 14:59:31 -0400 Subject: x86/amd-iommu: Reenable AMD IOMMU if it's mysteriously vanished over suspend AMD's reference BIOS code had a bug that could result in the firmware failing to reenable the iommu on resume. It transpires that this causes certain less than desirable behaviour when it comes to PCI accesses, to whit them ending up somewhere near Bristol when the more desirable outcome was Edinburgh. Sadness ensues, perhaps along with filesystem corruption. Let's make sure that it gets turned back on, and that we restore its configuration so decisions it makes bear some resemblance to those made by reasonable people rather than crack-addled lemurs who spent all your DMA on Thunderbird. 
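To make the restore path easier to follow: the extra IOMMU state lives in an index/data style indirect register space reached through PCI config space, so saving and replaying it comes down to a pattern along the lines of the sketch below. The 0xf8/0xfc index/data pair, the l1 block selector and the write-enable bit are taken from the helpers this patch adds; 'dev' stands for the IOMMU's PCI function and <linux/pci.h> is assumed, so treat this as an illustration rather than the patch itself.

	/* Read one register from l1 block 'l1' via the 0xf8/0xfc index/data pair */
	static u32 l1_read(struct pci_dev *dev, u16 l1, u8 reg)
	{
		u32 val;

		pci_write_config_dword(dev, 0xf8, reg | (l1 << 16));
		pci_read_config_dword(dev, 0xfc, &val);
		return val;
	}

	/* Write it back; bit 31 of the index enables the write, then read mode is restored */
	static void l1_write(struct pci_dev *dev, u16 l1, u8 reg, u32 val)
	{
		pci_write_config_dword(dev, 0xf8, reg | (l1 << 16) | (1 << 31));
		pci_write_config_dword(dev, 0xfc, val);
		pci_write_config_dword(dev, 0xf8, reg | (l1 << 16));
	}

Nothing needs to happen on suspend; on resume every stored value is simply written back through the same pair, which is what the quirk handler added below does for all six l1 blocks and the l2 space.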
Signed-off-by: Matthew Garrett Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 21 ++++-- arch/x86/kernel/amd_iommu_init.c | 122 +++++++++++++++++++++++++++++---- 2 files changed, 123 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 08616180deaf..bdd20c8891e8 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -416,13 +416,22 @@ struct amd_iommu { struct dma_ops_domain *default_dom; /* - * This array is required to work around a potential BIOS bug. - * The BIOS may miss to restore parts of the PCI configuration - * space when the system resumes from S3. The result is that the - * IOMMU does not execute commands anymore which leads to system - * failure. + * We can't rely on the BIOS to restore all values on reinit, so we + * need to stash them */ - u32 cache_cfg[4]; + + /* The iommu BAR */ + u32 stored_addr_lo; + u32 stored_addr_hi; + + /* + * Each iommu has 6 l1s, each of which is documented as having 0x12 + * registers + */ + u32 stored_l1[6][0x12]; + + /* The l2 indirect registers */ + u32 stored_l2[0x83]; }; /* diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 5a170cbbbed8..44710d86a431 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size) return 1UL << shift; } +/* Access to l1 and l2 indexed register spaces */ + +static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); + pci_read_config_dword(iommu->dev, 0xfc, &val); + return val; +} + +static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); + pci_write_config_dword(iommu->dev, 0xfc, val); + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +} + +static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf0, address); + pci_read_config_dword(iommu->dev, 0xf4, &val); + return val; +} + +static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); + pci_write_config_dword(iommu->dev, 0xf4, val); +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; u32 range, misc; + int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); @@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); - if (is_rd890_iommu(iommu->dev)) { - pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); - pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); - pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); - pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); - } + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * Some rd890 systems may not be fully reconfigured by the BIOS, so + * it's necessary for us to store this information so it can be + * reprogrammed on resume + */ + + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, + 
&iommu->stored_addr_lo); + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, + &iommu->stored_addr_hi); + + /* Low bit locks writes to configuration space */ + iommu->stored_addr_lo &= ~1; + + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); + + for (i = 0; i < 0x83; i++) + iommu->stored_l2[i] = iommu_read_l2(iommu, i); } /* @@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_COHERENT_EN); } -static void iommu_apply_quirks(struct amd_iommu *iommu) +static void iommu_apply_resume_quirks(struct amd_iommu *iommu) { - if (is_rd890_iommu(iommu->dev)) { - pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); - pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); - pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); - pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); - } + int i, j; + u32 ioc_feature_control; + struct pci_dev *pdev = NULL; + + /* RD890 BIOSes may not have completely reconfigured the iommu */ + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * First, we need to ensure that the iommu is enabled. This is + * controlled by a register in the northbridge + */ + pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); + + if (!pdev) + return; + + /* Select Northbridge indirect register 0x75 and enable writing */ + pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); + pci_read_config_dword(pdev, 0x64, &ioc_feature_control); + + /* Enable the iommu */ + if (!(ioc_feature_control & 0x1)) + pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); + + pci_dev_put(pdev); + + /* Restore the iommu BAR */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo); + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, + iommu->stored_addr_hi); + + /* Restore the l1 indirect regs for each of the 6 l1s */ + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); + + /* Restore the l2 indirect regs */ + for (i = 0; i < 0x83; i++) + iommu_write_l2(iommu, i, iommu->stored_l2[i]); + + /* Lock PCI setup registers */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo | 1); } /* @@ -1147,7 +1237,6 @@ static void enable_iommus(void) for_each_iommu(iommu) { iommu_disable(iommu); - iommu_apply_quirks(iommu); iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); @@ -1173,6 +1262,11 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_apply_resume_quirks(iommu); + /* re-load the hardware */ enable_iommus(); -- cgit v1.2.3 From 5d0d71569e671239ae0d905ced9b65cd843f99ee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Oct 2010 11:13:21 +0200 Subject: x86/amd-iommu: Update copyright headers This patch updates the copyright headers in all source files of the AMD IOMMU driver. 
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 2 +- arch/x86/include/asm/amd_iommu_proto.h | 2 +- arch/x86/include/asm/amd_iommu_types.h | 2 +- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 5af2982133b5..f16a2caca1e0 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index cb030374b90a..916bc8111a01 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * * This program is free software; you can redistribute it and/or modify it diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index bdd20c8891e8..e3509fc303bf 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 679b6450382b..d2fdb0826df2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 44710d86a431..3cb482e123de 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * -- cgit v1.2.3 From 447b1d43ded08c11f4f3d344b4e07e6dcddbef95 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Wed, 13 Oct 2010 19:10:42 +0100 Subject: x86, olpc: Register XO-1 platform devices The upcoming XO-1 rfkill driver (for drivers/platform/x86) will register itself with the name "xo1-rfkill", and the already-merged XO-1 poweroff code uses name "olpc-xo1" Add the necessary mechanics so that these devices are properly initialized on XO-1 laptops. Signed-off-by: Daniel Drake LKML-Reference: <20101013181042.90C8F9D401B@zog.reactivated.net> Cc: Matthew Garrett Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/olpc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 747261302598..edaf3fe8dc5e 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -223,8 +224,25 @@ static bool __init platform_detect(void) return true; } +static int __init add_xo1_platform_devices(void) +{ + struct platform_device *pdev; + + pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + return 0; +} + static int __init olpc_init(void) { + int r = 0; + if (!olpc_ofw_present() || !platform_detect()) return 0; @@ -251,6 +269,12 @@ static int __init olpc_init(void) olpc_platform_info.boardrev >> 4, olpc_platform_info.ecver); + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ + r = add_xo1_platform_devices(); + if (r) + return r; + } + return 0; } -- cgit v1.2.3 From d7acb92fea932ad2e7846480aeacddc2c03c8485 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 13 Oct 2010 16:00:29 -0700 Subject: x86-64, asm: If the assembler supports fxsave64, use it Kbuild allows for us to probe for the existence of specific constructs in the assembler, use them to find out if we can use fxsave64 and permit the compiler to generate better code. Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 8 ++++++-- arch/x86/include/asm/i387.h | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8aa1b59b9074..40668a96ca0c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en # is .cfi_signal_frame supported too? cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) + +# does binutils support specific instructions? +asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) + +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a73a8d5a5e69..70f105b352ee 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -175,7 +175,7 @@ static inline void fpu_fxsave(struct fpu *fpu) uses any extended registers for addressing, a second REX prefix will be generated (to the assembler, rex64 followed by semicolon is a separate instruction), and hence the 64-bitness is lost. */ -#if 0 +#ifdef CONFIG_AS_FXSAVEQ /* Using "fxsaveq %0" would be the ideal choice, but is only supported starting with gas 2.16. 
*/ __asm__ __volatile__("fxsaveq %0" -- cgit v1.2.3 From 0eead9ab41da33644ae2c97c57ad03da636a0422 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Oct 2010 10:57:40 -0700 Subject: Don't dump task struct in a.out core-dumps akiphie points out that a.out core-dumps have that odd task struct dumping that was never used and was never really a good idea (it goes back into the mists of history, probably the original core-dumping code). Just remove it. Also do the access_ok() check on dump_write(). It probably doesn't matter (since normal filesystems all seem to do it anyway), but he points out that it's normally done by the VFS layer, so ... [ I suspect that we should possibly do "vfs_write()" instead of calling ->write directly. That also does the whole fsnotify and write statistics thing, which may or may not be a good idea. ] And just to be anal, do this all for the x86-64 32-bit a.out emulation code too, even though it's not enabled (and won't currently even compile) Reported-by: akiphie Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 0350311906ae..2d93bdbc9ac0 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -34,7 +34,7 @@ #include #undef WARN_OLD -#undef CORE_DUMP /* probably broken */ +#undef CORE_DUMP /* definitely broken */ static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs); static int load_aout_library(struct file *); @@ -131,21 +131,15 @@ static void set_brk(unsigned long start, unsigned long end) * macros to write out all the necessary info. */ -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} +#include #define DUMP_WRITE(addr, nr) \ if (!dump_write(file, (void *)(addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(offset) \ - if (file->f_op->llseek) { \ - if (file->f_op->llseek(file, (offset), 0) != (offset)) \ - goto end_coredump; \ - } else \ - file->f_pos = (offset) +#define DUMP_SEEK(offset) \ + if (!dump_seek(file, offset)) \ + goto end_coredump; #define START_DATA() (u.u_tsize << PAGE_SHIFT) #define START_STACK(u) (u.start_stack) @@ -217,12 +211,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, dump_size = dump.u_ssize << PAGE_SHIFT; DUMP_WRITE(dump_start, dump_size); } - /* - * Finally dump the task struct. Not be used by gdb, but - * could be useful - */ - set_fs(KERNEL_DS); - DUMP_WRITE(current, sizeof(*current)); end_coredump: set_fs(fs); return has_dumped; -- cgit v1.2.3 From 9e9006e9090dc1f88d5127cb69f416013e7ecd60 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 14 Oct 2010 10:13:13 -0700 Subject: x86, olpc: XO-1 uses/depends on PCI olpc-xo1 uses pci_*() interfaces so it should depend on PCI. 
Otherwise we get build failure like: arch/x86/kernel/olpc-xo1.c:65: error: implicit declaration of function 'pci_enable_device_io' arch/x86/kernel/olpc-xo1.c:71: error: implicit declaration of function 'pci_request_region' arch/x86/kernel/olpc-xo1.c:80: error: implicit declaration of function 'pci_release_region' Signed-off-by: Randy Dunlap Acked-by: Daniel Drake Cc: Stephen Rothwell LKML-Reference: <20101014101313.adf7eb2a.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0bcb8765b1c0..c010b8d82712 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2068,7 +2068,7 @@ config OLPC config OLPC_XO1 tristate "OLPC XO-1 support" - depends on OLPC + depends on OLPC && PCI ---help--- Add support for non-essential features of the OLPC XO-1 laptop. -- cgit v1.2.3 From 940b3c7b193ec54141976a8642c00582f423690c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2010 19:36:26 +0200 Subject: x86: sfi: Make local functions static Signed-off-by: Thomas Gleixner Cc: Len Brown --- arch/x86/kernel/sfi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c index cb22acf3ed09..dd4c281ffe57 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/kernel/sfi.c @@ -34,7 +34,7 @@ #ifdef CONFIG_X86_LOCAL_APIC static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; -void __init mp_sfi_register_lapic_address(unsigned long address) +static void __init mp_sfi_register_lapic_address(unsigned long address) { mp_lapic_addr = address; @@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address) } /* All CPUs enumerated by SFI must be present and enabled */ -void __cpuinit mp_sfi_register_lapic(u8 id) +static void __cpuinit mp_sfi_register_lapic(u8 id) { if (MAX_APICS - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", -- cgit v1.2.3 From 40ffa93791985ab300fd488072e9f37ccf72e88c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2010 21:08:14 +0200 Subject: x86: Remove stale pmtimer_64.c This file is unused since the apic unification in 2.6.29, but nobody noticed. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/pmtimer_64.c | 69 -------------------------------------------- 2 files changed, 70 deletions(-) delete mode 100644 arch/x86/kernel/pmtimer_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266bd..dd9a2e459c78 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -120,7 +120,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o - obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c deleted file mode 100644 index b112406f1996..000000000000 --- a/arch/x86/kernel/pmtimer_64.c +++ /dev/null @@ -1,69 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. 
- * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -static unsigned pmtimer_wait_tick(void) -{ - u32 a, b; - for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; - a == b; - b = inl(pmtmr_ioport) & ACPI_PM_MASK) - cpu_relax(); - return b; -} - -/* note: wait time is rounded up to one tick */ -void pmtimer_wait(unsigned us) -{ - u32 a, b; - a = pmtimer_wait_tick(); - do { - b = inl(pmtmr_ioport); - cpu_relax(); - } while (cyc2us(b - a) < us); -} - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 1; -} - -__setup("nopmtimer", nopmtimer_setup); -- cgit v1.2.3 From ba0cef3d149ce4db293c572bf36ed352b11ce7b9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 15 Oct 2010 15:15:01 +0200 Subject: perf_events: Fix bogus AMD64 generic TLB events PERF_COUNT_HW_CACHE_DTLB:READ:MISS had a bogus umask value of 0 which counts nothing. Needed to be 0x7 (to count all possibilities). PERF_COUNT_HW_CACHE_ITLB:READ:MISS had a bogus umask value of 0 which counts nothing. Needed to be 0x3 (to count all possibilities). Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Robert Richter Cc: # as far back as it applies LKML-Reference: <4cb85478.41e9d80a.44e2.3f00@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index c2897b7b4a3b..46d58448c3af 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, -- cgit v1.2.3 From e360adbe29241a0194e10e20595360dd7b98a2b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Oct 2010 14:01:34 +0800 Subject: irq_work: Add generic hardirq context callbacks Provide a mechanism that allows running code in IRQ context. It is most useful for NMI code that needs to interact with the rest of the system -- like wakeup a task to drain buffers. Perf currently has such a mechanism, so extract that and provide it as a generic feature, independent of perf so that others may also benefit. 
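On x86 the arch glue is small: an interrupt vector whose handler calls back into the generic code, and a raise hook that self-IPIs. Stripped to its essentials it looks like the sketch below, which mirrors the irq_work.c hunk further down; the IDT registration, statistics accounting and the CONFIG_X86_LOCAL_APIC guard are omitted here.

	/* Handler behind IRQ_WORK_VECTOR: acknowledge the APIC and run queued work */
	void smp_irq_work_interrupt(struct pt_regs *regs)
	{
		irq_enter();
		ack_APIC_irq();
		irq_work_run();
		irq_exit();
	}

	/* Called by the generic irq_work code to get an IRQ context callback ASAP */
	void arch_irq_work_raise(void)
	{
		if (cpu_has_apic)
			apic->send_IPI_self(IRQ_WORK_VECTOR);
	}
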
The IRQ context callback is generated through self-IPIs where possible, or on architectures like powerpc the decrementer (the built-in timer facility) is set to generate an interrupt immediately. Architectures that don't have anything like this get to do with a callback from the timer tick. These architectures can call irq_work_run() at the tail of any IRQ handlers that might enqueue such work (like the perf IRQ handler) to avoid undue latencies in processing the work. Signed-off-by: Peter Zijlstra Acked-by: Kyle McMartin Acked-by: Martin Schwidefsky [ various fixes ] Signed-off-by: Huang Ying LKML-Reference: <1287036094.7768.291.camel@yhuang-dev> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/entry_arch.h | 4 ++-- arch/x86/include/asm/hardirq.h | 2 +- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/irq_vectors.h | 4 ++-- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/cpu/perf_event.c | 19 ------------------- arch/x86/kernel/entry_64.S | 6 +++--- arch/x86/kernel/irq.c | 8 ++++---- arch/x86/kernel/irq_work.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 6 +++--- 11 files changed, 48 insertions(+), 35 deletions(-) create mode 100644 arch/x86/kernel/irq_work.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9815221976a7..fd227d6b8d9c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -25,6 +25,7 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_PERF_EVENTS if (!M386 && !M486) + select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 8e8ec663a98f..b8e96a18676b 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_EVENTS -BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) +#ifdef CONFIG_IRQ_WORK +BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index aeab29aee617..55e4de613f0e 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,7 +14,7 @@ typedef struct { #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; - unsigned int apic_pending_irqs; + unsigned int apic_irq_work_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 46c0fe05f230..3a54a1ca1a02 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,7 +29,7 @@ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); extern void error_interrupt(void); -extern void perf_pending_interrupt(void); +extern void irq_work_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index e2ca30092557..6af0894dafb4 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -114,9 +114,9 @@ #define X86_PLATFORM_IPI_VECTOR 0xed /* - * Performance monitoring pending work vector: + * IRQ work vector: */ -#define LOCAL_PENDING_VECTOR 0xec +#define IRQ_WORK_VECTOR 0xec #define 
UV_BAU_MESSAGE 0xea diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9d3f485e5dd0..7490bf8d1459 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,6 +35,7 @@ obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e2513f26ba8b..fe73c1844a9a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1196,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) return handled; } -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_event_do_pending(); - irq_exit(); -} - -void set_perf_event_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbba..c375c79065f8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1023,9 +1023,9 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_EVENTS -apicinterrupt LOCAL_PENDING_VECTOR \ - perf_pending_interrupt smp_perf_pending_interrupt +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 91fd0c70a18a..44edb03fc9ec 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance monitoring interrupts\n"); - seq_printf(p, "%*s: ", prec, "PND"); + seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); - seq_printf(p, " Performance pending work\n"); + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; - sum += irq_stats(cpu)->apic_pending_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 000000000000..ca8f703a1e70 --- /dev/null +++ b/arch/x86/kernel/irq_work.c @@ -0,0 +1,30 @@ +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include +#include + +void smp_irq_work_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); + irq_exit(); +} + +void arch_irq_work_raise(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!cpu_has_apic) + return; 
+ + apic->send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +#endif +} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc578..713969b9266b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -224,9 +224,9 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_EVENTS - alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); + /* IRQ work interrupts: */ +# ifdef CONFIG_IRQ_WORK + alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); # endif #endif -- cgit v1.2.3 From e82b8e4ea4f3dffe6e7939f90e78da675fcc450e Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:20 -0700 Subject: x86: Add IRQ_TIME_ACCOUNTING This patch adds IRQ_TIME_ACCOUNTING option on x86 and runtime enables it when TSC is enabled. This change just enables fine grained irq time accounting, isn't used yet. Following patches use it for different purposes. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-6-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 11 +++++++++++ arch/x86/kernel/tsc.c | 8 ++++++++ 2 files changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..f4c70c246ffe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -795,6 +795,17 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + default n + ---help--- + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + source "kernel/Kconfig.preempt" config X86_UP_APIC diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 26a863a9c2a8..a1c2cd768538 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,10 +104,14 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int no_sched_irq_time; + static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; + if (!strncmp(str, "noirqtime", 9)) + no_sched_irq_time = 1; return 1; } @@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason) if (!tsc_unstable) { tsc_unstable = 1; sched_clock_stable = 0; + disable_sched_clock_irqtime(); printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) @@ -987,6 +992,9 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; + if (!no_sched_irq_time) + enable_sched_clock_irqtime(); + lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); lpj_fine = lpj; -- cgit v1.2.3 From 9717967c4b704ce344c954afb5bb160aa9c01c34 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 18 Oct 2010 13:47:48 -0700 Subject: x86: ioapic: Call free_irte only if interrupt remapping enabled On a system that support intr-rempping when booting with "intremap=off" [ 177.895501] BUG: unable to handle kernel NULL pointer dereference at 00000000000000f8 [ 177.913316] IP: [] free_irte+0x47/0xc0 ... 
[ 178.173326] Call Trace: [ 178.173574] [] destroy_irq+0x3a/0x75 [ 178.192934] [] arch_teardown_msi_irq+0xe/0x10 [ 178.193418] [] arch_teardown_msi_irqs+0x56/0x7f [ 178.213021] [] free_msi_irqs+0x8d/0xeb Call free_irte only when interrupt remapping is enabled. Signed-off-by: Yinghai Lu LKML-Reference: <4CBCB274.7010108@kernel.org> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 20e47e045395..8ae808d110f4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3109,7 +3109,8 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - free_irte(irq); + if (intr_remapping_enabled) + free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); -- cgit v1.2.3 From 9581d442b9058d3699b4be568b6e5eae38a41493 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 19 Oct 2010 16:46:55 +0200 Subject: KVM: Fix fs/gs reload oops with invalid ldt kvm reloads the host's fs and gs blindly, however the underlying segment descriptors may be invalid due to the user modifying the ldt after loading them. Fix by using the safe accessors (loadsegment() and load_gs_index()) instead of home grown unsafe versions. This is CVE-2010-3698. KVM-Stable-Tag. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 24 ------------------------ arch/x86/kvm/svm.c | 15 ++++++++++----- arch/x86/kvm/vmx.c | 24 +++++++++--------------- 3 files changed, 19 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 502e53f999cf..c52e2eb40a1e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -652,20 +652,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } -static inline u16 kvm_read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 kvm_read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -673,16 +659,6 @@ static inline u16 kvm_read_ldt(void) return ldt; } -static inline void kvm_load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void kvm_load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 81ed28cb36e6..8a3f9f64f86f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3163,8 +3163,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = kvm_read_fs(); - gs_selector = kvm_read_gs(); + savesegment(fs, fs_selector); + savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ @@ -3251,10 +3251,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_load_fs(fs_selector); - kvm_load_gs(gs_selector); - kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); + loadsegment(fs, fs_selector); +#ifdef CONFIG_X86_64 + load_gs_index(gs_selector); + 
wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, gs_selector); +#endif + kvm_load_ldt(ldt_selector); reload_tss(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 49b25eee25ac..7bddfab12013 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -803,7 +803,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = kvm_read_fs(); + savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -811,7 +811,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = kvm_read_gs(); + savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -841,27 +841,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - unsigned long flags; - if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - kvm_load_fs(vmx->host_state.fs_sel); + loadsegment(fs, vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); + load_gs_index(vmx->host_state.gs_sel); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, vmx->host_state.gs_sel); #endif - local_irq_restore(flags); } reload_tss(); #ifdef CONFIG_X86_64 @@ -2589,8 +2583,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); -- cgit v1.2.3 From 44235dcde416104b8e1db7606c283f4c0149c760 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 14 Oct 2010 17:04:59 -0700 Subject: x86, mm: Fix bogus whitespace in sync_global_pgds() Whitespace cleanup only. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init_64.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 74f0f358b07b..1ad7c0ff5d2b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -103,28 +103,28 @@ __setup("noexec32=", nonx32_setup); */ void sync_global_pgds(unsigned long start, unsigned long end) { - unsigned long address; - - for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) - != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } } /* -- cgit v1.2.3 From 617d34d9e5d8326ec8f188c616aa06ac59d083fe Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 21 Sep 2010 12:01:51 -0700 Subject: x86, mm: Hold mm->page_table_lock while doing vmalloc_sync Take mm->page_table_lock while syncing the vmalloc region. This prevents a race with the Xen pagetable pin/unpin code, which expects that the page_table_lock is already held. If this race occurs, then Xen can see an inconsistent page type (a page can either be read/write or a pagetable page, and pin/unpin converts it between them), which will cause either the pin or the set_p[gm]d to fail; either will crash the kernel. vmalloc_sync_all() should be called rarely, so this extra use of page_table_lock should not interfere with its normal users. The mm pointer is stashed in the pgd page's index field, as that won't be otherwise used for pgds. Reported-by: Ian Campbell Originally-by: Jan Beulich LKML-Reference: <4CB88A4C.1080305@goop.org> Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/mm/fault.c | 11 ++++++++++- arch/x86/mm/init_64.c | 7 +++++++ arch/x86/mm/pgtable.c | 20 +++++++++++++++++--- 4 files changed, 36 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2d0a33bd2971..ada823a13c7c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; extern spinlock_t pgd_lock; extern struct list_head pgd_list; +extern struct mm_struct *pgd_page_get_mm(struct page *page); + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index caec22906d7c..6c27c39f8a37 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -229,7 +229,16 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) + spinlock_t *pgt_lock; + int ret; + + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + + if (!ret) break; } spin_unlock_irqrestore(&pgd_lock, flags); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ad7c0ff5d2b..4d323fb770c2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -116,12 +116,19 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + spinlock_t *pgt_lock; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); } spin_unlock_irqrestore(&pgd_lock, flags); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..c70e57dbb491 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? 
KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(pgd_t *pgd) + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd) } /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); pgd_list_add(pgd); + } } static void pgd_dtor(pgd_t *pgd) @@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) */ spin_lock_irqsave(&pgd_lock, flags); - pgd_ctor(pgd); + pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); spin_unlock_irqrestore(&pgd_lock, flags); -- cgit v1.2.3 From a68c439b1966c91f0ef474e2bf275d6792312726 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Oct 2010 12:27:53 +0200 Subject: apic, x86: Check if EILVT APIC registers are available (AMD only) This patch implements checks for the availability of LVT entries (APIC500-530) and reserves it if used. The check becomes necessary since we want to let the BIOS provide the LVT offsets. The offsets should be determined by the subsystems using it like those for MCE threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts are supported. Beginning with family 10h at least 4 offsets are available. Since offsets must be consistent for all cores, we keep track of the LVT offsets in software and reserve the offset for the same vector also to be used on other cores. An offset is freed by setting the entry to APIC_EILVT_MASKED. If the BIOS is right, there should be no conflicts. Otherwise a "[Firmware Bug]: ..." error message is generated. However, if software does not properly determines the offsets, it is not necessarily a BIOS bug. Signed-off-by: Robert Richter LKML-Reference: <1286360874-1471-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 1 + arch/x86/kernel/apic/apic.c | 83 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 75 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7fe3b3060f08..a859ca461fb0 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -131,6 +131,7 @@ #define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 +#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H #define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) #define APIC_EILVT_MSG_FIX 0x0 #define APIC_EILVT_MSG_SMI 0x2 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8cf86fb3b4e3..2bfeafd24f5c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -52,6 +52,7 @@ #include #include #include +#include unsigned int num_processors; @@ -370,24 +371,88 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) } /* - * Setup extended LVT, AMD specific (K8, family 10h) + * Setup extended LVT, AMD specific * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. + * Software should use the LVT offsets the BIOS provides. 
The offsets + * are determined by the subsystems using it like those for MCE + * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts + * are supported. Beginning with family 10h at least 4 offsets are + * available. * - * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. + * Since the offsets must be consistent for all cores, we keep track + * of the LVT offsets in software and reserve the offset for the same + * vector also to be used on other cores. An offset is freed by + * setting the entry to APIC_EILVT_MASKED. + * + * If the BIOS is right, there should be no conflicts. Otherwise a + * "[Firmware Bug]: ..." error message is generated. However, if + * software does not properly determines the offsets, it is not + * necessarily a BIOS bug. */ #define APIC_EILVT_LVTOFF_MCE 0 #define APIC_EILVT_LVTOFF_IBS 1 -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; + +static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) +{ + return (old & APIC_EILVT_MASKED) + || (new == APIC_EILVT_MASKED) + || ((new & ~APIC_EILVT_MASKED) == old); +} + +static unsigned int reserve_eilvt_offset(int offset, unsigned int new) +{ + unsigned int rsvd; /* 0: uninitialized */ + + if (offset >= APIC_EILVT_NR_MAX) + return ~0; + + rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED; + do { + if (rsvd && + !eilvt_entry_is_changeable(rsvd, new)) + /* may not change if vectors are different */ + return rsvd; + rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); + } while (rsvd != new); + + return new; +} + +/* + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. + */ + +static int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) { - unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); - unsigned int v = (mask << 16) | (msg_type << 8) | vector; + unsigned long reg = APIC_EILVTn(offset); + unsigned int new, old, reserved; + + new = (mask << 16) | (msg_type << 8) | vector; + old = apic_read(reg); + reserved = reserve_eilvt_offset(offset, new); - apic_write(reg, v); + if (reserved != new) { + pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " + "vector 0x%x was already reserved by another core, " + "APIC%lX=0x%x\n", + smp_processor_id(), new, reserved, reg, old); + return -EINVAL; + } + + if (!eilvt_entry_is_changeable(old, new)) { + pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " + "register already in use, APIC%lX=0x%x\n", + smp_processor_id(), new, reg, old); + return -EBUSY; + } + + apic_write(reg, new); + + return 0; } u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -- cgit v1.2.3 From 27afdf2008da0b8878a73e32e4eb12381b84e224 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Oct 2010 12:27:54 +0200 Subject: apic, x86: Use BIOS settings for IBS and MCE threshold interrupt LVT offsets We want the BIOS to set up the EILVT APIC registers. The offsets were hardcoded and BIOS settings were overwritten by the OS. Now, the subsystems for MCE threshold and IBS determine the LVT offset from the registers the BIOS has set up. If the BIOS setup is buggy on a family 10h system, a workaround enables IBS. If the OS determines an invalid register setup, a "[Firmware Bug]: " error message is reported. We need this change also for upcoming cpu families.
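To make the reservation rule above concrete, here is a minimal stand-alone user-space sketch of the same idea: an LVT offset, once claimed for a vector, may only be reused for that vector or freed again by masking. The constants, names and the two calls in main() are illustrative only; the kernel keeps this state in eilvt_offsets[] and uses atomic_cmpxchg().

/*
 * Stand-alone sketch of the per-offset reservation rule described above.
 * NR_OFFSETS, MASKED and the two "CPUs" in main() are illustrative only.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_OFFSETS	4		/* at least 4 offsets from family 10h on */
#define MASKED		(1u << 16)	/* entry is masked, carries no vector */

static atomic_uint offsets[NR_OFFSETS];	/* 0 means "uninitialized" */

/* An entry may change if it is masked, gets masked, or keeps its vector. */
static int entry_is_changeable(unsigned int old, unsigned int new)
{
	return (old & MASKED) || (new == MASKED) || ((new & ~MASKED) == old);
}

/* Returns the value that owns the offset afterwards; != new means conflict. */
static unsigned int reserve(int offset, unsigned int new)
{
	unsigned int rsvd = atomic_load(&offsets[offset]) & ~MASKED;

	do {
		if (rsvd && !entry_is_changeable(rsvd, new))
			return rsvd;	/* a different vector already owns it */
	} while (!atomic_compare_exchange_weak(&offsets[offset], &rsvd, new));

	return new;
}

int main(void)
{
	printf("cpu0 reserves vector 0xf9: got 0x%x\n", reserve(1, 0xf9));
	printf("cpu1 reserves vector 0xfa: got 0x%x\n", reserve(1, 0xfa));
	return 0;
}

The second call reports 0xf9 back, i.e. the offset stays with the first vector and the caller has to treat it as a firmware/software conflict, which is exactly what the pr_err(FW_BUG ...) paths in the patch do.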
Signed-off-by: Robert Richter LKML-Reference: <1286360874-1471-3-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 4 +- arch/x86/kernel/apic/apic.c | 19 +---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 27 ++++++- arch/x86/oprofile/op_model_amd.c | 145 ++++++++++++++++++++++++++++++----- 4 files changed, 154 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1fa03e04ae44..286de34b0ed6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void) } #endif -extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); -extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); - +extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2bfeafd24f5c..850657d1b0ed 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -390,9 +390,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * necessarily a BIOS bug. */ -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) @@ -426,7 +423,7 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new) * enables the vector. See also the BKDGs. */ -static int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) +int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) { unsigned long reg = APIC_EILVTn(offset); unsigned int new, old, reserved; @@ -454,19 +451,7 @@ static int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) return 0; } - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} -EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); +EXPORT_SYMBOL_GPL(setup_APIC_eilvt); /* * Program the next event, relative to now diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 39aaee5c1ab2..80c482382d5c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; unsigned int bank, block; struct thresh_restart tr; - u8 lvt_off; + int lvt_off = -1; + u8 offset; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -162,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif - lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0); + offset = (high & MASK_LVTOFF_HI) >> 20; + if (lvt_off < 0) { + if (setup_APIC_eilvt(offset, + THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) { + pr_err(FW_BUG "cpu %d, failed to " + "setup threshold interrupt " + "for bank %d, block %d " + "(MSR%08X=0x%x%08x)", + smp_processor_id(), bank, block, + address, high, low); + continue; + } + lvt_off = offset; + } else if (lvt_off != offset) { + pr_err(FW_BUG "cpu %d, invalid threshold " + "interrupt offset %d for bank %d," 
+ "block %d (MSR%08X=0x%x%08x)", + smp_processor_id(), lvt_off, bank, + block, address, high, low); + continue; + } high &= ~MASK_LVTOFF_HI; high |= lvt_off << 20; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b67a6b5aa8d4..42fb46f83883 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -64,15 +64,22 @@ static u64 ibs_op_ctl; * IBS cpuid feature detection */ -#define IBS_CPUID_FEATURES 0x8000001b +#define IBS_CPUID_FEATURES 0x8000001b /* * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but * bit 0 is used to indicate the existence of IBS. */ -#define IBS_CAPS_AVAIL (1LL<<0) -#define IBS_CAPS_RDWROPCNT (1LL<<3) -#define IBS_CAPS_OPCNT (1LL<<4) +#define IBS_CAPS_AVAIL (1U<<0) +#define IBS_CAPS_RDWROPCNT (1U<<3) +#define IBS_CAPS_OPCNT (1U<<4) + +/* + * IBS APIC setup + */ +#define IBSCTL 0x1cc +#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) +#define IBSCTL_LVT_OFFSET_MASK 0x0F /* * IBS randomization macros @@ -266,6 +273,74 @@ static void op_amd_stop_ibs(void) wrmsrl(MSR_AMD64_IBSOPCTL, 0); } +static inline int eilvt_is_available(int offset) +{ + /* check if we may assign a vector */ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int ibs_eilvt_valid(void) +{ + u64 val; + int offset; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS " + "interrupt offset %d (MSR%08X=0x%016llx)", + smp_processor_id(), offset, + MSR_AMD64_IBSCTL, val); + return 0; + } + + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (eilvt_is_available(offset)) + return !0; + + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d " + "not available (MSR%08X=0x%016llx)", + smp_processor_id(), offset, + MSR_AMD64_IBSCTL, val); + + return 0; +} + +static inline int get_ibs_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void) +{ + int offset; + + offset = get_ibs_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void) +{ + int offset; + + offset = get_ibs_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, @@ -376,13 +451,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } if (ibs_caps) - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); + setup_APIC_ibs(); } static void op_amd_cpu_shutdown(void) { if (ibs_caps) - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); + clear_APIC_ibs(); } static int op_amd_check_ctrs(struct pt_regs * const regs, @@ -445,16 +520,11 @@ static void op_amd_stop(struct op_msrs const * const msrs) op_amd_stop_ibs(); } -static int __init_ibs_nmi(void) +static int setup_ibs_ctl(int ibs_eilvt_off) { -#define IBSCTL_LVTOFFSETVAL (1 << 8) -#define IBSCTL 0x1cc struct pci_dev *cpu_cfg; int nodes; u32 value = 0; - u8 ibs_eilvt_off; - - ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); nodes = 0; cpu_cfg = NULL; @@ -466,21 +536,60 @@ static int __init_ibs_nmi(void) break; ++nodes; pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVTOFFSETVAL); + | IBSCTL_LVT_OFFSET_VALID); pci_read_config_dword(cpu_cfg, IBSCTL, 
&value); - if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { pci_dev_put(cpu_cfg); printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x", value); - return 1; + "IBSCTL = 0x%08x\n", value); + return -EINVAL; } } while (1); if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS"); - return 1; + printk(KERN_DEBUG "No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +static int force_ibs_eilvt_setup(void) +{ + int i; + int ret; + + /* find the next free available EILVT entry */ + for (i = 1; i < 4; i++) { + if (!eilvt_is_available(i)) + continue; + ret = setup_ibs_ctl(i); + if (ret) + return ret; + return 0; } + printk(KERN_DEBUG "No EILVT entry available\n"); + + return -EBUSY; +} + +static int __init_ibs_nmi(void) +{ + int ret; + + if (ibs_eilvt_valid()) + return 0; + + ret = force_ibs_eilvt_setup(); + if (ret) + return ret; + + if (!ibs_eilvt_valid()) + return -EFAULT; + + pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); + return 0; } -- cgit v1.2.3 From f01f7c56a1425b9749a99af821e1de334fb64d7e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 19 Oct 2010 22:17:37 +0000 Subject: x86, mm: Fix incorrect data type in vmalloc_sync_all() arch/x86/mm/fault.c: In function 'vmalloc_sync_all': arch/x86/mm/fault.c:238: warning: assignment makes integer from pointer without a cast introduced by 617d34d9e5d8326ec8f188c616aa06ac59d083fe. Signed-off-by: Borislav Petkov LKML-Reference: <20101020103642.GA3135@kryptos.osrc.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6c27c39f8a37..0cdb8d493f61 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -230,7 +230,7 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; - int ret; + pmd_t *ret; pgt_lock = &pgd_page_get_mm(page)->page_table_lock; -- cgit v1.2.3 From 932967202182743c01a2eee4bdfa2c42697bc586 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Oct 2010 11:07:03 +0800 Subject: x86: Spread tlb flush vector between nodes Currently, flush tlb vector allocation is based on the equation below: sender = smp_processor_id() % 8 This isn't optimal: CPUs from different nodes can get the same vector, which causes a lot of lock contention. Instead, we can assign the same vectors to CPUs from the same node, while different nodes get different vectors. This has the following advantages: a. If there is lock contention, it is between CPUs from one node, which should be much cheaper than contention between nodes. b. Lock contention between nodes is avoided completely. This especially benefits kswapd, which is the biggest user of tlb flush, since kswapd sets its affinity to a specific node. In my test, this could reduce > 20% CPU overhead in an extreme case. The test machine has 4 nodes and each node has 16 CPUs. I then bind each node's kswapd to the first CPU of the node. I run a workload with 4 sequential mmap file read threads. The files are empty sparse files. This workload triggers a lot of page reclaim and tlb flushing. The kswapd binding makes it easy to trigger the extreme tlb flush lock contention, because otherwise kswapd keeps migrating between CPUs of a node and I can't get a stable result. Of course, a real workload won't always see such heavy tlb flush lock contention, but it is possible.
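As a worked example of the assignment described above, the small stand-alone program below prints the flush vector offset each CPU would get. The topology (4 nodes with 4 CPUs each) and the vector count are made up for illustration; the kernel derives them from cpumask_of_node() and NUM_INVALIDATE_TLB_VECTORS.

/*
 * Worked example of the per-node vector assignment: 8 vectors / 4 nodes
 * gives 2 vectors per node; CPUs of a node cycle over that node's vectors.
 * The topology below is invented for illustration only.
 */
#include <stdio.h>

#define NUM_VECTORS	8	/* stands in for NUM_INVALIDATE_TLB_VECTORS */
#define NR_NODES	4
#define CPUS_PER_NODE	4

int main(void)
{
	int nr_node_vecs = NUM_VECTORS / NR_NODES;	/* 2 vectors per node */
	int node, cpu;

	for (node = 0; node < NR_NODES; node++) {
		int node_offset = (node % NUM_VECTORS) * nr_node_vecs;

		for (cpu = 0; cpu < CPUS_PER_NODE; cpu++)
			printf("node %d, cpu %2d -> tlb vector offset %d\n",
			       node, node * CPUS_PER_NODE + cpu,
			       node_offset + cpu % nr_node_vecs);
	}
	return 0;
}

Node 0's CPUs alternate between offsets 0 and 1, node 1's between 2 and 3, and so on, so senders on different nodes never share a flush vector or its lock.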
[ hpa: folded in fix from Eric Dumazet to use this_cpu_read() ] Signed-off-by: Shaohua Li LKML-Reference: <1287544023.4571.8.camel@sli10-conroe.sh.intel.com> Cc: Eric Dumazet Signed-off-by: H. Peter Anvin --- arch/x86/mm/tlb.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c03f14ab6667..49358481c733 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,8 @@ union smp_flush_state { want false sharing in the per cpu data segment. */ static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; +static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); + /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. @@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, union smp_flush_state *f; /* Caller has disabled preemption */ - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + sender = this_cpu_read(tlb_vector_offset); f = &flush_state[sender]; /* @@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask, flush_tlb_others_ipi(cpumask, mm, va); } +static void __cpuinit calculate_tlb_offset(void) +{ + int cpu, node, nr_node_vecs; + /* + * we are changing tlb_vector_offset for each CPU in runtime, but this + * will not cause inconsistency, as the write is atomic under X86. we + * might see more lock contentions in a short time, but after all CPU's + * tlb_vector_offset are changed, everything should go normal + * + * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might + * waste some vectors. + **/ + if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) + nr_node_vecs = 1; + else + nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; + + for_each_online_node(node) { + int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * + nr_node_vecs; + int cpu_offset = 0; + for_each_cpu(cpu, cpumask_of_node(node)) { + per_cpu(tlb_vector_offset, cpu) = node_offset + + cpu_offset; + cpu_offset++; + cpu_offset = cpu_offset % nr_node_vecs; + } + } +} + +static int tlb_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + switch (action & 0xf) { + case CPU_ONLINE: + case CPU_DEAD: + calculate_tlb_offset(); + } + return NOTIFY_OK; +} + static int __cpuinit init_smp_flush(void) { int i; @@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void) for (i = 0; i < ARRAY_SIZE(flush_state); i++) raw_spin_lock_init(&flush_state[i].tlbstate_lock); + calculate_tlb_offset(); + hotcpu_notifier(tlb_cpuhp_notify, 0); return 0; } core_initcall(init_smp_flush); -- cgit v1.2.3 From 66f2b061546974b96b7b238a92ce89a87ecf0754 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 20 Oct 2010 15:55:35 -0700 Subject: x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G Set CONFIG_ARCH_DMA_ADDR_T_64BIT when we set dma_addr_t to 64 bits in ; this allows Kconfig decisions based on this property. Signed-off-by: FUJITA Tomonori LKML-Reference: <201010202255.o9KMtZXu009370@imap1.linux-foundation.org> Acked-by: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..2924f4e77791 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1148,6 +1148,9 @@ config X86_PAE config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config ARCH_DMA_ADDR_T_64BIT + def_bool X86_64 || HIGHMEM64G + config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EMBEDDED default y -- cgit v1.2.3
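The new symbol only restates, at Kconfig level, a property the architecture already decides when it defines dma_addr_t, so other options can simply depend on ARCH_DMA_ADDR_T_64BIT. As a sketch of a C-side consumer (the function below is hypothetical and not part of the patch), a driver that cares about DMA address width could assert the invariant at compile time:

/*
 * Hypothetical consumer of CONFIG_ARCH_DMA_ADDR_T_64BIT -- not part of the
 * patch above.  The check only restates the invariant the def_bool encodes:
 * dma_addr_t is 64 bits wide exactly when the option is set.
 */
#include <linux/kernel.h>	/* BUILD_BUG_ON() */
#include <linux/types.h>	/* dma_addr_t, u32, u64 */

static inline void example_check_dma_addr_width(void)
{
#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
	BUILD_BUG_ON(sizeof(dma_addr_t) != sizeof(u64));
#else
	BUILD_BUG_ON(sizeof(dma_addr_t) != sizeof(u32));
#endif
}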