diff options
Diffstat (limited to 'arch/x86/kernel')
80 files changed, 5606 insertions, 3049 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 88d1bfc847d3..4f78bd682125 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp) obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit_$(BITS).o +obj-y += setup.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o @@ -44,6 +44,7 @@ obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o obj-$(CONFIG_X86_DS) += ds.o +obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f802..631086159c53 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -33,6 +33,7 @@ #include <linux/irq.h> #include <linux/bootmem.h> #include <linux/ioport.h> +#include <linux/pci.h> #include <asm/pgtable.h> #include <asm/io_apic.h> @@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (triggering == ACPI_LEVEL_SENSITIVE) + if (trigger == ACPI_LEVEL_SENSITIVE) eisa_set_level_irq(gsi); } #endif #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -903,10 +904,8 @@ extern int es7000_plat; #endif static struct { - int apic_id; int gsi_base; int gsi_end; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; int mp_find_ioapic(int gsi) @@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 mp_ioapics[idx].apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].apicver = 0; -#endif + /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); @@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void) } } -int mp_register_gsi(u32 gsi, int triggering, int polarity) +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) { +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; int ioapic; - int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 + u8 pin; - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; -#else + if (!acpi_ioapic) + return 0; + if (!dev) + return 0; + if (dev->bus != &pci_bus_type) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mp_ioapics[ioapic].apicid; + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + save_mp_irq(&mp_irq); +#endif + return 0; +} + +int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + int ioapic; + int ioapic_pin; + struct io_apic_irq_attr irq_attr; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; -#endif /* Don't set up the ACPI SCI because it's already set up */ if (acpi_gbl_FADT.sci_interrupt == gsi) @@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) gsi = ioapic_renumber_irq(ioapic, gsi); #endif - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + "%d-%d\n", mp_ioapics[ioapic].apicid, ioapic_pin); return gsi; } - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else - return gsi; -#endif - } - - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } -#endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} -int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ -#ifdef CONFIG_X86_MPPARSE - struct mpc_intsrc mp_irq; - int ioapic; + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); - if (!acpi_ioapic) - return 0; + set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, + trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + io_apic_set_pci_routing(dev, gsi, &irq_attr); - /* print the entry should happen on mptable identically */ - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | - (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); - mp_irq.srcbus = number; - mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); - ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; - mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - - save_mp_irq(&mp_irq); -#endif - return 0; + return gsi; } /* diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0e9def..167bc16ce0e5 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -9,7 +9,7 @@ always := wakeup.bin targets := wakeup.elf wakeup.lds -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o +wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 000000000000..f51eb0bb56ce --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 000000000000..6206033ba202 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad52..1c60554537c3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -55,7 +55,16 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 + **pte_page, gfp_t gfp); +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages); +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif #ifdef CONFIG_AMD_IOMMU_STATS @@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) + for_each_iommu(iommu) iommu_poll_events(iommu); return IRQ_HANDLED; @@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid) __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, domid, 1, 1); - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { spin_lock_irqsave(&iommu->lock, flags); __iommu_queue_command(iommu, &cmd); __iommu_completion_wait(iommu); @@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid) } } +void amd_iommu_flush_all_domains(void) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + iommu_flush_domain(i); + } +} + +void amd_iommu_flush_all_devices(void) +{ + struct amd_iommu *iommu; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] == NULL) + continue; + + iommu = amd_iommu_rlookup_table[i]; + if (!iommu) + continue; + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom, unsigned long phys_addr, int prot) { - u64 __pte, *pte, *page; + u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); @@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom, if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; + pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, * as allocated in the aperture */ if (addr < dma_dom->aperture_size) - __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); + __set_bit(addr >> PAGE_SHIFT, + dma_dom->aperture[0]->bitmap); } return 0; @@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, ****************************************************************************/ /* - * The address allocator core function. + * The address allocator core functions. * * called with domain->lock held */ + +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64* fetch_pte(struct protection_domain *domain, + unsigned long address) +{ + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function is used to add a new aperture range to an existing + * aperture in case of dma_ops domain allocation or address allocation + * failure. + */ +static int alloc_new_range(struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, + bool populate, gfp_t gfp) +{ + int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i; + +#ifdef CONFIG_IOMMU_STRESS + populate = false; +#endif + + if (index >= APERTURE_MAX_RANGES) + return -ENOMEM; + + dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); + if (!dma_dom->aperture[index]) + return -ENOMEM; + + dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); + if (!dma_dom->aperture[index]->bitmap) + goto out_free; + + dma_dom->aperture[index]->offset = dma_dom->aperture_size; + + if (populate) { + unsigned long address = dma_dom->aperture_size; + int i, num_ptes = APERTURE_RANGE_PAGES / 512; + u64 *pte, *pte_page; + + for (i = 0; i < num_ptes; ++i) { + pte = alloc_pte(&dma_dom->domain, address, + &pte_page, gfp); + if (!pte) + goto out_free; + + dma_dom->aperture[index]->pte_pages[i] = pte_page; + + address += APERTURE_RANGE_SIZE / 64; + } + } + + dma_dom->aperture_size += APERTURE_RANGE_SIZE; + + /* Intialize the exclusion range if necessary */ + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset && + iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } + + /* + * Check for areas already mapped as present in the new aperture + * range and mark those pages as reserved in the allocator. Such + * mappings may already exist as a result of requested unity + * mappings for devices. + */ + for (i = dma_dom->aperture[index]->offset; + i < dma_dom->aperture_size; + i += PAGE_SIZE) { + u64 *pte = fetch_pte(&dma_dom->domain, i); + if (!pte || !IOMMU_PTE_PRESENT(*pte)) + continue; + + dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); + } + + return 0; + +out_free: + free_page((unsigned long)dma_dom->aperture[index]->bitmap); + + kfree(dma_dom->aperture[index]); + dma_dom->aperture[index] = NULL; + + return -ENOMEM; +} + +static unsigned long dma_ops_area_alloc(struct device *dev, + struct dma_ops_domain *dom, + unsigned int pages, + unsigned long align_mask, + u64 dma_mask, + unsigned long start) +{ + unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; + int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i = start >> APERTURE_RANGE_SHIFT; + unsigned long boundary_size; + unsigned long address = -1; + unsigned long limit; + + next_bit >>= PAGE_SHIFT; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + for (;i < max_index; ++i) { + unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; + + if (dom->aperture[i]->offset >= dma_mask) + break; + + limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, + dma_mask >> PAGE_SHIFT); + + address = iommu_area_alloc(dom->aperture[i]->bitmap, + limit, next_bit, pages, 0, + boundary_size, align_mask); + if (address != -1) { + address = dom->aperture[i]->offset + + (address << PAGE_SHIFT); + dom->next_address = address + (pages << PAGE_SHIFT); + break; + } + + next_bit = 0; + } + + return address; +} + static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, unsigned long align_mask, u64 dma_mask) { - unsigned long limit; unsigned long address; - unsigned long boundary_size; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, - dma_mask >> PAGE_SHIFT); +#ifdef CONFIG_IOMMU_STRESS + dom->next_address = 0; + dom->need_flush = true; +#endif - if (dom->next_bit >= limit) { - dom->next_bit = 0; - dom->need_flush = true; - } + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, dom->next_address); - address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, align_mask); + dom->next_address = 0; + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, 0); dom->need_flush = true; } - if (likely(address != -1)) { - dom->next_bit = address + pages; - address <<= PAGE_SHIFT; - } else + if (unlikely(address == -1)) address = bad_dma_address; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) { - address >>= PAGE_SHIFT; - iommu_area_free(dom->bitmap, address, pages); + unsigned i = address >> APERTURE_RANGE_SHIFT; + struct aperture_range *range = dom->aperture[i]; - if (address >= dom->next_bit) + BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); + +#ifdef CONFIG_IOMMU_STRESS + if (i < 4) + return; +#endif + + if (address >= dom->next_address) dom->need_flush = true; + + address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + + iommu_area_free(range->bitmap, address, pages); + } /**************************************************************************** @@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) { - unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->bitmap, start_page, pages); + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); + } } static void free_pagetable(struct protection_domain *domain) @@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain) */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { + int i; + if (!dom) return; free_pagetable(&dom->domain); - kfree(dom->pte_pages); - - kfree(dom->bitmap); + for (i = 0; i < APERTURE_MAX_RANGES; ++i) { + if (!dom->aperture[i]) + continue; + free_page((unsigned long)dom->aperture[i]->bitmap); + kfree(dom->aperture[i]); + } kfree(dom); } @@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, - unsigned order) +static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) { struct dma_ops_domain *dma_dom; - unsigned i, num_pte_pages; - u64 *l2_pde; - u64 address; - - /* - * Currently the DMA aperture must be between 32 MB and 1GB in size - */ - if ((order < 25) || (order > 30)) - return NULL; dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); if (!dma_dom) @@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = (1ULL << order); - dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), - GFP_KERNEL); - if (!dma_dom->bitmap) - goto free_dma_dom; - /* - * mark the first page as allocated so we never return 0 as - * a valid dma-address. So we can use 0 as error value - */ - dma_dom->bitmap[0] = 1; - dma_dom->next_bit = 0; dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); - } + if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + goto free_dma_dom; /* - * At the last step, build the page tables so we don't need to - * allocate page table pages in the dma_ops mapping/unmapping - * path. + * mark the first page as allocated so we never return 0 as + * a valid dma-address. So we can use 0 as error value */ - num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), - GFP_KERNEL); - if (!dma_dom->pte_pages) - goto free_dma_dom; - - l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); - if (l2_pde == NULL) - goto free_dma_dom; + dma_dom->aperture[0]->bitmap[0] = 1; + dma_dom->next_address = 0; - dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); - - for (i = 0; i < num_pte_pages; ++i) { - dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->pte_pages[i]) - goto free_dma_dom; - address = virt_to_phys(dma_dom->pte_pages[i]); - l2_pde[i] = IOMMU_L1_PDE(address); - } return dma_dom; @@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb, struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; unsigned long flags; if (devid > amd_iommu_last_bdf) @@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb, "to a non-dma-ops domain\n", dev_name(dev)); switch (action) { - case BUS_NOTIFY_BOUND_DRIVER: - if (domain) - goto out; - dma_domain = find_protection_domain(devid); - if (!dma_domain) - dma_domain = iommu->default_dom; - attach_device(iommu, &dma_domain->domain, devid); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", dma_domain->domain.id, dev_name(dev)); - break; - case BUS_NOTIFY_UNBIND_DRIVER: + case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; detach_device(domain, devid); @@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb, dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu, order); + dma_domain = dma_ops_domain_alloc(iommu); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", (*domain)->id, dev_name(dev)); + DUMP_printk("Using protection domain %d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) @@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev, } /* + * If the pte_page is not yet allocated this function is called + */ +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 **pte_page, gfp_t gfp) +{ + u64 *pte, *page; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L2_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L1_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page) + *pte_page = pte; + + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function fetches the PTE for a given address in the aperture + */ +static u64* dma_ops_get_pte(struct dma_ops_domain *dom, + unsigned long address) +{ + struct aperture_range *aperture; + u64 *pte, *pte_page; + + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return NULL; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) { + pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; + } else + pte += IOMMU_PTE_L0_INDEX(address); + + return pte; +} + +/* * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ @@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; - pte += IOMMU_PTE_L0_INDEX(address); + pte = dma_ops_get_pte(dom, address); + if (!pte) + return bad_dma_address; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) { + struct aperture_range *aperture; u64 *pte; if (address >= dom->aperture_size) return; - WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) + return; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); @@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev, u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start; + dma_addr_t address, start, ret; unsigned int pages; unsigned long align_mask = 0; int i; @@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev, if (align) align_mask = (1UL << get_order(size)) - 1; +retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) - goto out; + if (unlikely(address == bad_dma_address)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the + * first run. This is a small optimization. + */ + dma_dom->next_address = dma_dom->aperture_size; + + if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + goto out; + + /* + * aperture was sucessfully enlarged by 128 MB, try + * allocation again + */ + goto retry; + } start = address; for (i = 0; i < pages; ++i) { - dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + if (ret == bad_dma_address) + goto out_unmap; + paddr += PAGE_SIZE; start += PAGE_SIZE; } @@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev, out: return address; + +out_unmap: + + for (--i; i >= 0; --i) { + start -= PAGE_SIZE; + dma_ops_domain_unmap(iommu, dma_dom, start); + } + + dma_ops_free_addresses(dma_dom, address, pages); + + return bad_dma_address; } /* @@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) + if (*dma_addr == bad_dma_address) { + spin_unlock_irqrestore(&domain->lock, flags); goto out_free; + } iommu_completion_wait(iommu); @@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void) iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - dma_dom = dma_ops_domain_alloc(iommu, order); + dma_dom = dma_ops_domain_alloc(iommu); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); @@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = { int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; int ret; /* @@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void) * found in the system. Devices not assigned to any other * protection domain will be assigned to the default one. */ - list_for_each_entry(iommu, &amd_iommu_list, list) { - iommu->default_dom = dma_ops_domain_alloc(iommu, order); + for_each_iommu(iommu) { + iommu->default_dom = dma_ops_domain_alloc(iommu); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; @@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void) free_domains: - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { if (iommu->default_dom) dma_ops_domain_free(iommu->default_dom); } @@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, old_domain = domain_for_device(devid); if (old_domain) - return -EBUSY; + detach_device(old_domain, devid); attach_device(iommu, domain, devid); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902dac..238989ec077d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -115,15 +115,21 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); +bool amd_iommu_dump; + static int __initdata amd_iommu_detected; u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +#ifdef CONFIG_IOMMU_STRESS +bool amd_iommu_isolate = false; +#else bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ +#endif + bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid) static inline unsigned long tbl_size(int entry_size) { unsigned shift = PAGE_SHIFT + - get_order(amd_iommu_last_bdf * entry_size); + get_order(((int)amd_iommu_last_bdf + 1) * entry_size); return 1UL << shift; } @@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size) * This function set the exclusion range in the IOMMU. DMA accesses to the * exclusion range are passed through untranslated */ -static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) +static void iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; @@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) } /* Generic functions to enable/disable certain features of the IOMMU. */ -static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -static void __init iommu_enable(struct amd_iommu *iommu) +static void iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); @@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } -/* Function to enable IOMMU event logging and event interrupts */ -static void __init iommu_enable_event_logging(struct amd_iommu *iommu) +static void iommu_disable(struct amd_iommu *iommu) { - iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + iommu_feature_disable(iommu, CONTROL_IOMMU_EN); } /* @@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - entry = (u64)virt_to_phys(cmd_buf); + return cmd_buf; +} + +/* + * This function writes the command buffer address to the hardware and + * enables it. + */ +static void iommu_enable_command_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->cmd_buf == NULL); + + entry = (u64)virt_to_phys(iommu->cmd_buf); entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + &entry, sizeof(entry)); /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); - - return cmd_buf; } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu) /* allocates the memory where the IOMMU will log its events to */ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) { - u64 entry; iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(EVT_BUFFER_SIZE)); if (iommu->evt_buf == NULL) return NULL; + return iommu->evt_buf; +} + +static void iommu_enable_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->evt_buf == NULL); + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, &entry, sizeof(entry)); - iommu->evt_buf_size = EVT_BUFFER_SIZE; - - return iommu->evt_buf; + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); } static void __init free_event_buffer(struct amd_iommu *iommu) @@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, p += sizeof(struct ivhd_header); end += h->length; + while (p < end) { e = (struct ivhd_entry *)p; switch (e->type) { case IVHD_DEV_ALL: + + DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x" + " last device %02x:%02x.%x flags: %02x\n", + PCI_BUS(iommu->first_device), + PCI_SLOT(iommu->first_device), + PCI_FUNC(iommu->first_device), + PCI_BUS(iommu->last_device), + PCI_SLOT(iommu->last_device), + PCI_FUNC(iommu->last_device), + e->flags); + for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); break; case IVHD_DEV_SELECT: + + DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x " + "flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: + + DUMP_printk(" DEV_SELECT_RANGE_START\t " + "devid: %02x:%02x.%x flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid_start = e->devid; flags = e->flags; ext_flags = 0; alias = false; break; case IVHD_DEV_ALIAS: + + DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x " + "flags: %02x devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(iommu, devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: + + DUMP_printk(" DEV_ALIAS_RANGE\t\t " + "devid: %02x:%02x.%x flags: %02x " + "devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid_start = e->devid; flags = e->flags; devid_to = e->ext >> 8; @@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, alias = true; break; case IVHD_DEV_EXT_SELECT: + + DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " + "flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: + + DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " + "%02x:%02x.%x flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid_start = e->devid; flags = e->flags; ext_flags = e->ext; alias = false; break; case IVHD_DEV_RANGE_END: + + DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid)); + devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) @@ -679,7 +774,7 @@ static void __init free_iommu_all(void) { struct amd_iommu *iommu, *next; - list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { + for_each_iommu_safe(iommu, next) { list_del(&iommu->list); free_iommu_one(iommu); kfree(iommu); @@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->mmio_base) return -ENOMEM; - iommu_set_device_table(iommu); iommu->cmd_buf = alloc_command_buffer(iommu); if (!iommu->cmd_buf) return -ENOMEM; @@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table) h = (struct ivhd_header *)p; switch (*p) { case ACPI_IVHD_TYPE: + + DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + "seg: %d flags: %01x info %04x\n", + PCI_BUS(h->devid), PCI_SLOT(h->devid), + PCI_FUNC(h->devid), h->cap_ptr, + h->pci_seg, h->flags, h->info); + DUMP_printk(" mmio-addr: %016llx\n", + h->mmio_phys); + iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); if (iommu == NULL) return -ENOMEM; @@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msix(struct amd_iommu *iommu) -{ - struct amd_iommu *curr; - struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ - int nvec = 0, i; - - list_for_each_entry(curr, &amd_iommu_list, list) { - if (curr->dev == iommu->dev) { - entries[nvec].entry = curr->evt_msi_num; - entries[nvec].vector = 0; - curr->int_enabled = true; - nvec++; - } - } - - if (pci_enable_msix(iommu->dev, entries, nvec)) { - pci_disable_msix(iommu->dev); - return 1; - } - - for (i = 0; i < nvec; ++i) { - int r = request_irq(entries->vector, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD IOMMU", - NULL); - if (r) - goto out_free; - } - - return 0; - -out_free: - for (i -= 1; i >= 0; --i) - free_irq(entries->vector, NULL); - - pci_disable_msix(iommu->dev); - - return 1; -} - static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; - struct amd_iommu *curr; - - list_for_each_entry(curr, &amd_iommu_list, list) { - if (curr->dev == iommu->dev) - curr->int_enabled = true; - } - if (pci_enable_msi(iommu->dev)) return 1; @@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu->int_enabled = true; + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + return 0; } -static int __init iommu_init_msi(struct amd_iommu *iommu) +static int iommu_init_msi(struct amd_iommu *iommu) { if (iommu->int_enabled) return 0; - if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) - return iommu_setup_msix(iommu); - else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) return iommu_setup_msi(iommu); return 1; @@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; + char *s; e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) @@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: + kfree(e); + return 0; case ACPI_IVMD_TYPE: + s = "IVMD_TYPEi\t\t\t"; e->devid_start = e->devid_end = m->devid; break; case ACPI_IVMD_TYPE_ALL: + s = "IVMD_TYPE_ALL\t\t"; e->devid_start = 0; e->devid_end = amd_iommu_last_bdf; break; case ACPI_IVMD_TYPE_RANGE: + s = "IVMD_TYPE_RANGE\t\t"; e->devid_start = m->devid; e->devid_end = m->aux; break; @@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m) e->address_end = e->address_start + PAGE_ALIGN(m->range_length); e->prot = m->flags >> 1; + DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" + " range_start: %016llx range_end: %016llx flags: %x\n", s, + PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), + PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), + PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), + e->address_start, e->address_end, m->flags); + list_add_tail(&e->list, &amd_iommu_unity_map); return 0; @@ -967,18 +1037,28 @@ static void init_device_table(void) * This function finally enables all IOMMUs found in the system after * they have been initialized */ -static void __init enable_iommus(void) +static void enable_iommus(void) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { + iommu_set_device_table(iommu); + iommu_enable_command_buffer(iommu); + iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); - iommu_enable_event_logging(iommu); iommu_enable(iommu); } } +static void disable_iommus(void) +{ + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_disable(iommu); +} + /* * Suspend/Resume support * disable suspend until real resume implemented @@ -986,12 +1066,31 @@ static void __init enable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + /* + * Disable IOMMUs before reprogramming the hardware registers. + * IOMMU is still enabled from the resume kernel. + */ + disable_iommus(); + + /* re-load the hardware */ + enable_iommus(); + + /* + * we have to flush after the IOMMUs are enabled because a + * disabled IOMMU will never execute the commands we send + */ + amd_iommu_flush_all_domains(); + amd_iommu_flush_all_devices(); + return 0; } static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) { - return -EINVAL; + /* disable IOMMUs to go out of the way for BIOS */ + disable_iommus(); + + return 0; } static struct sysdev_class amd_iommu_sysdev_class = { @@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void) enable_iommus(); - printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", - (1 << (amd_iommu_aperture_order-20))); - printk(KERN_INFO "AMD IOMMU: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); @@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void) * ****************************************************************************/ +static int __init parse_amd_iommu_dump(char *str) +{ + amd_iommu_dump = true; + + return 1; +} + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str) return 1; } -static int __init parse_amd_iommu_size_options(char *str) -{ - unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); - - if ((order > 24) && (order < 31)) - amd_iommu_aperture_order = order; - - return 1; -} - +__setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); -__setup("amd_iommu_size=", parse_amd_iommu_size_options); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2870920f246..076d3881f3da 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include <linux/perf_counter.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> @@ -34,6 +35,7 @@ #include <linux/smp.h> #include <linux/mm.h> +#include <asm/perf_counter.h> #include <asm/pgalloc.h> #include <asm/atomic.h> #include <asm/mpspec.h> @@ -98,6 +100,29 @@ early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline void imcr_pic_to_apic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go through APIC */ + outb(0x01, 0x23); +} + +static inline void imcr_apic_to_pic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go directly to BSP */ + outb(0x00, 0x23); +} #endif #ifdef CONFIG_X86_64 @@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +int x2apic_mode; #ifdef CONFIG_X86_X2APIC -int x2apic; /* x2apic enabled before OS handover */ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { + if (x2apic_enabled()) { + pr_warning("Bios already enabled x2apic, " + "can't enforce nox2apic"); + return 0; + } + disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; @@ -209,6 +240,31 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +/* + * bare function to substitute write operation + * and it's _that_ fast :) + */ +static void native_apic_write_dummy(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +static u32 native_apic_read_dummy(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +/* + * right after this call apic->write/read doesn't do anything + * note that there is no restore operation it works one way + */ +void apic_disable(void) +{ + apic->read = native_apic_read_dummy; + apic->write = native_apic_write_dummy; +} + void native_apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) @@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) { - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); @@ -815,7 +871,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!x2apic && !apic_phys) + if (!x2apic_mode && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif + perf_counters_lapic_init(); preempt_disable(); @@ -1287,7 +1344,7 @@ void check_x2apic(void) { if (x2apic_enabled()) { pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic = 1; + x2apic_preenabled = x2apic_mode = 1; } } @@ -1295,7 +1352,7 @@ void enable_x2apic(void) { int msr, msr2; - if (!x2apic) + if (!x2apic_mode) return; rdmsr(MSR_IA32_APICBASE, msr, msr2); @@ -1304,6 +1361,7 @@ void enable_x2apic(void) wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } +#endif /* CONFIG_X86_X2APIC */ void __init enable_IR_x2apic(void) { @@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; - if (!cpu_has_x2apic) - return; - - if (!x2apic_preenabled && disable_x2apic) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); - return; + ret = dmar_table_init(); + if (ret) { + pr_debug("dmar_table_init() failed with %d:\n", ret); + goto ir_failed; } - if (x2apic_preenabled && disable_x2apic) - panic("Bios already enabled x2apic, can't enforce nox2apic"); - - if (!x2apic_preenabled && skip_ioapic_setup) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); - return; + if (!intr_remapping_supported()) { + pr_debug("intr-remapping not supported\n"); + goto ir_failed; } - ret = dmar_table_init(); - if (ret) { - pr_info("dmar_table_init() failed with %d:\n", ret); - if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); - else - pr_info("Not enabling x2apic,Intr-remapping\n"); + if (!x2apic_preenabled && skip_ioapic_setup) { + pr_info("Skipped enabling intr-remap because of skipping " + "io-apic setup\n"); return; } @@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - - if (ret && x2apic_preenabled) { - local_irq_restore(flags); - panic("x2apic enabled by bios. But IR enabling failed"); - } - + ret = enable_intr_remapping(x2apic_supported()); if (ret) goto end_restore; - if (!x2apic) { - x2apic = 1; + pr_info("Enabled Interrupt-remapping\n"); + + if (x2apic_supported() && !x2apic_mode) { + x2apic_mode = 1; enable_x2apic(); + pr_info("Enabled x2apic\n"); } end_restore: @@ -1378,37 +1422,34 @@ end_restore: * IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - else - reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries); unmask_8259A(); local_irq_restore(flags); end: - if (!ret) { - if (!x2apic_preenabled) - pr_info("Enabled x2apic and interrupt-remapping\n"); - else - pr_info("Enabled Interrupt-remapping\n"); - } else - pr_err("Failed to enable Interrupt-remapping and x2apic\n"); if (ioapic_entries) free_ioapic_entries(ioapic_entries); + + if (!ret) + return; + +ir_failed: + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else if (cpu_has_x2apic) + pr_info("Not enabling x2apic,Intr-remapping\n"); #else if (!cpu_has_x2apic) return; if (x2apic_preenabled) panic("x2apic enabled prior OS handover," - " enable CONFIG_INTR_REMAP"); - - pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); #endif return; } -#endif /* CONFIG_X86_X2APIC */ + #ifdef CONFIG_X86_64 /* @@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void) } mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_physical_apicid = 0; return 0; } #else @@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { - if (x2apic) { + unsigned int new_apicid; + + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; } - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ + /* If no local APIC can be found return early */ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else + /* lets NOP'ify apic operations */ + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } else { apic_phys = mp_lapic_addr; - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", + APIC_BASE, apic_phys); + } /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); + new_apicid = read_apic_id(); + if (boot_cpu_physical_apicid != new_apicid) { + boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since smp_sanity_check is prepared for such a case + * and disable smp mode + */ + apic_version[new_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } } /* @@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void) */ apic_printk(APIC_VERBOSE, "leaving PIC mode, " "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); + imcr_pic_to_apic(); } #endif if (apic->enable_apic_mode) @@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) */ apic_printk(APIC_VERBOSE, "disabling APIC mode, " "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); + imcr_apic_to_pic(); return; } #endif @@ -1969,10 +2024,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) local_irq_save(flags); disable_local_APIC(); -#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) disable_intr_remapping(); -#endif + local_irq_restore(flags); return 0; } @@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - -#ifdef CONFIG_INTR_REMAP - int ret; + int ret = 0; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) return 0; local_irq_save(flags); - if (x2apic) { + if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - return -ENOMEM; + ret = -ENOMEM; + goto restore; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { WARN(1, "Saving IO-APIC state failed: %d\n", ret); free_ioapic_entries(ioapic_entries); - return ret; + goto restore; } mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - enable_x2apic(); } -#else - if (!apic_pm_state.active) - return 0; - local_irq_save(flags); - if (x2apic) + if (x2apic_mode) enable_x2apic(); -#endif - else { /* * Make sure the APICBASE points to the right address @@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - reenable_intr_remapping(EIM_32BIT_APIC_ID); - - if (x2apic) { + if (intr_remapping_enabled) { + reenable_intr_remapping(x2apic_mode); unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } -#endif - +restore: local_irq_restore(flags); - - return 0; + return ret; } /* @@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ #ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) + +static int __cpuinit apic_cluster_num(void) { int i, clusters, zeros; unsigned id; u16 *bios_cpu_apicid; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); @@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void) ++zeros; } - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) + return clusters; +} + +static int __cpuinitdata multi_checked; +static int __cpuinitdata multi; + +static int __cpuinit set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void __cpuinit dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +__cpuinit int apic_is_clustered_box(void) +{ + dmi_check_multi(); + if (multi) return 1; + if (!is_vsmp_box()) + return 0; + /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. + * ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards */ - return (clusters > 2); + if (apic_cluster_num() > 1) + return 1; + + return 0; } #endif diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 306e5e88fb6f..d0c99abc26c3 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -161,7 +161,7 @@ static int flat_apic_id_registered(void) static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return initial_apic_id >> index_msb; } struct apic apic_flat = { @@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * regardless of how many processors are present (x86_64 ES7000 * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 302947775575..69328ac8de9c 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617d18e4..ef8d9290c7ea 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@ #include <asm/setup.h> #include <asm/irq_remapping.h> #include <asm/hpet.h> +#include <asm/hw_irq.h> #include <asm/uv/uv_hub.h> #include <asm/uv/uv_irq.h> @@ -129,12 +130,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node) { struct irq_pin_list *pin; - int node; - - node = cpu_to_node(cpu); pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -148,9 +146,6 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - u8 move_desc_pending : 1; -#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -182,16 +177,18 @@ int __init arch_early_irq_init(void) struct irq_cfg *cfg; struct irq_desc *desc; int count; + int node; int i; cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); + node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_bootmem_cpumask_var(&cfg[i].domain); - alloc_bootmem_cpumask_var(&cfg[i].old_domain); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } @@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) return cfg; } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node) { struct irq_cfg *cfg; - int node; - - node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { @@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; cfg = desc->chip_data; if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(cpu); + desc->chip_data = get_one_free_irq_cfg(node); if (!desc->chip_data) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); @@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) return 0; } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */ static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) { struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) if (!old_entry) return; - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) return; @@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { entry = head; while (entry) { @@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) } void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(cpu); + cfg = get_one_free_irq_cfg(node); if (!cfg) return; @@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); - init_copy_irq_2_pin(old_cfg, cfg, cpu); + init_copy_irq_2_pin(old_cfg, cfg, node); } static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->chip_data = NULL; } } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg = desc->chip_data; - - if (!cfg->move_in_progress) { - /* it means that domain is not changed */ - if (!cpumask_intersects(desc->affinity, mask)) - cfg->move_desc_pending = 1; - } -} -#endif +/* end for move_irq_desc */ #else static struct irq_cfg *irq_cfg(unsigned int irq) @@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq) #endif -#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -518,132 +492,18 @@ static void ioapic_mask_entry(int apic, int pin) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) -{ - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; -} - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - -/* - * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and - * leaves desc->affinity untouched. - */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned int irq; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; - - irq = desc->irq; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; - - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - - cpumask_copy(desc->affinity, mask); - - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); -} - -static void -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); - } - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - set_ioapic_affinity_irq_desc(desc, mask); -} -#endif /* CONFIG_SMP */ - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list *entry; entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", apic, pin); @@ -663,7 +523,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(cpu); + entry->next = get_one_free_irq_2_pin(node); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -672,7 +532,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { @@ -692,7 +552,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); + add_pin_to_irq_node(cfg, node, newapic, newpin); } static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -850,7 +710,6 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; @@ -948,20 +807,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries) - -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(ioapic_entries); -} - void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) { int apic; @@ -971,7 +816,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) kfree(ioapic_entries); } -#endif /* * Find the IRQ entry number of a certain pin. @@ -1032,54 +876,6 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1298,6 +1094,64 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + struct io_apic_irq_attr *irq_attr) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) { + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + void lock_vector_lock(void) { /* Used to the online set of cpus does not change @@ -1628,58 +1482,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq ioapic_write_entry(apic_id, pin, entry); } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; + int apic_id = 0, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { - - idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); - continue; - } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + apic_id = mp_find_ioapic(0); + if (apic_id < 0) + apic_id = 0; + } +#endif - irq = pin_2_irq(idx, apic_id, pin); + for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic_id].apicid, pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic_id].apicid, pin); + continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } - /* - * Skip the timer IRQ if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) - continue; + irq = pin_2_irq(idx, apic_id, pin); - desc = irq_to_desc_alloc_cpu(irq, cpu); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); - continue; - } - cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(apic_id, irq)) + continue; - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; } + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + /* + * don't mark it in pin_programmed, so later acpi could + * set it correctly when irq < 16 + */ + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); } if (notcon) @@ -1869,7 +1735,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) __apicdebuginit(void) print_local_APIC(void *dummy) { - unsigned int v, ver, maxlvt; + unsigned int i, v, ver, maxlvt; u64 icr; if (apic_verbosity == APIC_QUIET) @@ -1957,6 +1823,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); v = apic_read(APIC_TDCR); printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } printk("\n"); } @@ -2005,6 +1883,11 @@ __apicdebuginit(void) print_PIC(void) __apicdebuginit(int) print_all_ICs(void) { print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic || disable_apic) + return 0; + print_all_local_APICs(); print_IO_APIC(); @@ -2360,6 +2243,118 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); + +/* + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); +} + +static int +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + int ret = -1; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + ret = 0; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + +static int +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + return set_ioapic_affinity_irq_desc(desc, mask); +} #ifdef CONFIG_INTR_REMAP @@ -2374,26 +2369,25 @@ static int ioapic_retrigger_irq(unsigned int irq) * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. */ -static void +static int migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; unsigned int dest; unsigned int irq; + int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return; + return ret; irq = desc->irq; if (get_irte(irq, &irte)) - return; + return ret; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); + return ret; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2409,27 +2403,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) send_cleanup_vector(cfg); cpumask_copy(desc->affinity, mask); + + return 0; } /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - migrate_ioapic_irq_desc(desc, mask); + return migrate_ioapic_irq_desc(desc, mask); } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, mask); + return set_ir_ioapic_affinity_irq_desc(desc, mask); } #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { + return 0; } #endif @@ -2491,86 +2488,19 @@ static void irq_complete_move(struct irq_desc **descp) struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - if (likely(!cfg->move_desc_pending)) - return; - - /* domain has not changed, but affinity did */ - me = smp_processor_id(); - if (cpumask_test_cpu(me, desc->affinity)) { - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; - cfg->move_desc_pending = 0; - } -#endif + if (likely(!cfg->move_in_progress)) return; - } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; -#endif + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); - } } #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_X86_X2APIC -static void ack_x2apic_level(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - ack_x2APIC_irq(); - eoi_ioapic_irq(desc); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2634,9 +2564,6 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); - if (irq_remapped(irq)) - eoi_ioapic_irq(desc); - /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2683,22 +2610,50 @@ static void ack_apic_level(unsigned int irq) } #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ir_ack_apic_edge(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_edge(irq); -#endif - return ack_apic_edge(irq); + ack_APIC_irq(); } static void ir_ack_apic_level(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_level(irq); -#endif - return ack_apic_level(irq); + struct irq_desc *desc = irq_to_desc(irq); + + ack_APIC_irq(); + eoi_ioapic_irq(desc); } #endif /* CONFIG_INTR_REMAP */ @@ -2903,7 +2858,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2969,7 +2924,7 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); + add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { /* for edge trigger, setup_IO_APIC_irq already @@ -3006,7 +2961,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { @@ -3218,14 +3173,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node) { /* Allocate an unused irq */ unsigned int irq; unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int cpu = boot_cpu_id; struct irq_desc *desc_new = NULL; irq = 0; @@ -3234,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want) spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_cpu(new, cpu); + desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; @@ -3243,6 +3197,9 @@ unsigned int create_irq_nr(unsigned int irq_want) if (cfg_new->vector != 0) continue; + + desc_new = move_irq_desc(desc_new, node); + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; @@ -3260,11 +3217,12 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { + int node = cpu_to_node(boot_cpu_id); unsigned int irq_want; int irq; irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) irq = -1; @@ -3366,7 +3324,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3375,7 +3333,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3387,13 +3345,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); + + return 0; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void +static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@ -3402,11 +3362,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irte irte; if (get_irte(irq, &irte)) - return; + return -1; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3423,6 +3383,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ if (cfg->move_in_progress) send_cleanup_vector(cfg); + + return 0; } #endif @@ -3518,15 +3480,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) unsigned int irq_want; struct intel_iommu *iommu = NULL; int index = 0; + int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; + node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) return -1; irq_want = irq + 1; @@ -3576,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3585,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3597,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3630,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3639,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3651,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3707,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3715,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); + + return 0; } #endif @@ -3794,6 +3764,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, eligible_cpu); @@ -3807,15 +3779,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3833,10 +3803,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) struct uv_IO_APIC_route_entry *entry; int mmr_pnode; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - entry->mask = 1; mmr_pnode = uv_blade_to_pnode(mmr_blade); @@ -3900,6 +3870,71 @@ int __init arch_probe_nr_irqs(void) } #endif +static int __io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + struct irq_desc *desc; + struct irq_cfg *cfg; + int node; + int ioapic, pin; + int trigger, polarity; + + ioapic = irq_attr->ioapic; + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + + pin = irq_attr->ioapic_pin; + trigger = irq_attr->trigger; + polarity = irq_attr->polarity; + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, ioapic, pin); + } + + setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); + + return 0; +} + +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + int ioapic, pin; + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + ioapic = irq_attr->ioapic; + pin = irq_attr->ioapic_pin; + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, irq, irq_attr); +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -3980,6 +4015,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } +#endif int __init io_apic_get_version(int ioapic) { @@ -3992,39 +4028,6 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } -#endif - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - struct irq_desc *desc; - struct irq_cfg *cfg; - int cpu = boot_cpu_id; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - desc = irq_to_desc_alloc_cpu(irq, cpu); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); - return 0; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= NR_IRQS_LEGACY) { - cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); - } - - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); - - return 0; -} - int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { @@ -4055,51 +4058,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic, irq, irq_entry; + int pin, ioapic = 0, irq, irq_entry; struct irq_desc *desc; - struct irq_cfg *cfg; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, desc, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - continue; +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + ioapic = 0; + } +#endif - } + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); - /* - * Honour affinities which have been set in early boot - */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; - else - mask = apic->target_cpus(); + desc = irq_to_desc(irq); - if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); - else - set_ioapic_affinity_irq_desc(desc, mask); - } + /* + * Honour affinities which have been set in early boot + */ + if (desc->status & + (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) + mask = desc->affinity; + else + mask = apic->target_cpus(); + if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq_desc(desc, mask); + else + set_ioapic_affinity_irq_desc(desc, mask); } + } #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index ce4fbfa315a1..a691302dc3ff 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) } #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { printk(KERN_CONT "\n"); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e4..440a8bccd91a 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -160,7 +160,6 @@ extern struct apic apic_summit; extern struct apic apic_bigsmp; extern struct apic apic_es7000; extern struct apic apic_es7000_cluster; -extern struct apic apic_default; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652bb0e5..bc3e880f9b82 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic && (apic != &apic_x2apic_phys && + if (x2apic_mode && (apic != &apic_x2apic_phys && #ifdef CONFIG_X86_UV apic != &apic_x2apic_uv_x && #endif diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9cfe1f415d81..344eee4ac0a4 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); } - -/* In clustered mode, the high nibble of APIC ID is a cluster number. - * The low nibble is a 4-bit bitmap. */ -#define XAPIC_DEST_CPUS_SHIFT 4 -#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) -#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) - #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) static const struct cpumask *summit_target_cpus(void) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d17..8e4cbb255c38 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,7 +10,7 @@ #include <asm/apic.h> #include <asm/ipi.h> -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda69352976..ef0ae207a7c8 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP unsigned long val; @@ -562,7 +562,7 @@ void __init uv_system_init(void) union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int max_pnode = 0; + int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; @@ -574,6 +574,13 @@ void __init uv_system_init(void) mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + pnode_mask = (1 << n_val) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; + gnode_upper = ((unsigned long)gnode_extra << m_val); + printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", + n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) @@ -583,15 +590,18 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_blade_info); get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_node_to_blade); memset(uv_node_to_blade, 255, bytes); bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_cpu_to_blade); memset(uv_cpu_to_blade, 255, bytes); blade = 0; @@ -607,11 +617,6 @@ void __init uv_system_init(void) } } - pnode_mask = (1 << n_val) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_upper = (((unsigned long)node_id.s.node_id) & - ~((1 << n_val) - 1)) << m_val; - uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size); @@ -634,6 +639,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; + uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c1162f..1a830cbd7015 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -146,4 +146,5 @@ void foo(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062fb4b5..898ecc47e129 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -125,6 +125,7 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4e242f9a06e4..3efcb2b96a15 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@ # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks # # Don't trace early stages of a secondary CPU boot @@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_X86_MCE) += mcheck/ -obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE) += mcheck/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa64..e5b27d8f1b47 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@ #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> +#include <asm/pci-direct.h> #ifdef CONFIG_X86_64 # include <asm/numa_64.h> @@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = hard_smp_processor_id(); + unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) @@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* check CPU config space for extended APIC ID */ + if (c->x86 >= 0xf) { + unsigned int val; + val = read_pci_config(0, 24, 0, 0x68); + if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } +#endif } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 77848d9fca68..3ffdcfa9abdf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include <linux/io.h> #include <asm/stackprotector.h> +#include <asm/perf_counter.h> #include <asm/mmu_context.h> #include <asm/hypervisor.h> #include <asm/processor.h> @@ -299,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; void load_percpu_segment(int cpu) { @@ -768,6 +770,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif @@ -813,6 +821,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. + */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -825,10 +843,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); @@ -861,6 +875,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + init_hw_perf_counters(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6a..6b2a52dd0403 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -32,9 +32,7 @@ static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); -static DEFINE_PER_CPU(unsigned, cpu_modelflag); static DEFINE_PER_CPU(int, cpu_priv_count); -static DEFINE_PER_CPU(unsigned, cpu_model); static DEFINE_MUTEX(cpu_debug_lock); @@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = { { "value", CPU_REG_ALL, 1 }, }; -/* Intel Registers Range */ -static struct cpu_debug_range cpu_intel_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, - { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, - { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, - { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, - - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, - { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, - { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, - - { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, - { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, - { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, - { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, - - { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, - - { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, - { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, - { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, - { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, - { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, - { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, - { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, - { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, - { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, - { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, - { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, - - { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, - { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, - { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, - { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, - { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, - { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, - { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, - { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, - { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, - { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, - - { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, - { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, - { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, - { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, - { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, - { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, - { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, - - { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, - - { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, - { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, +/* CPU Registers Range */ +static struct cpu_debug_range cpu_reg_range[] = { + { 0x00000000, 0x00000001, CPU_MC, }, + { 0x00000006, 0x00000007, CPU_MONITOR, }, + { 0x00000010, 0x00000010, CPU_TIME, }, + { 0x00000011, 0x00000013, CPU_PMC, }, + { 0x00000017, 0x00000017, CPU_PLATFORM, }, + { 0x0000001B, 0x0000001B, CPU_APIC, }, + { 0x0000002A, 0x0000002B, CPU_POWERON, }, + { 0x0000002C, 0x0000002C, CPU_FREQ, }, + { 0x0000003A, 0x0000003A, CPU_CONTROL, }, + { 0x00000040, 0x00000047, CPU_LBRANCH, }, + { 0x00000060, 0x00000067, CPU_LBRANCH, }, + { 0x00000079, 0x00000079, CPU_BIOS, }, + { 0x00000088, 0x0000008A, CPU_CACHE, }, + { 0x0000008B, 0x0000008B, CPU_BIOS, }, + { 0x0000009B, 0x0000009B, CPU_MONITOR, }, + { 0x000000C1, 0x000000C4, CPU_PMC, }, + { 0x000000CD, 0x000000CD, CPU_FREQ, }, + { 0x000000E7, 0x000000E8, CPU_PERF, }, + { 0x000000FE, 0x000000FE, CPU_MTRR, }, + + { 0x00000116, 0x0000011E, CPU_CACHE, }, + { 0x00000174, 0x00000176, CPU_SYSENTER, }, + { 0x00000179, 0x0000017B, CPU_MC, }, + { 0x00000186, 0x00000189, CPU_PMC, }, + { 0x00000198, 0x00000199, CPU_PERF, }, + { 0x0000019A, 0x0000019A, CPU_TIME, }, + { 0x0000019B, 0x0000019D, CPU_THERM, }, + { 0x000001A0, 0x000001A0, CPU_MISC, }, + { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, + { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, }, + { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, }, + { 0x00000250, 0x00000250, CPU_MTRR, }, + { 0x00000258, 0x00000259, CPU_MTRR, }, + { 0x00000268, 0x0000026F, CPU_MTRR, }, + { 0x00000277, 0x00000277, CPU_PAT, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, }, + + { 0x00000300, 0x00000311, CPU_PMC, }, + { 0x00000345, 0x00000345, CPU_PMC, }, + { 0x00000360, 0x00000371, CPU_PMC, }, + { 0x0000038D, 0x00000390, CPU_PMC, }, + { 0x000003A0, 0x000003BE, CPU_PMC, }, + { 0x000003C0, 0x000003CD, CPU_PMC, }, + { 0x000003E0, 0x000003E1, CPU_PMC, }, + { 0x000003F0, 0x000003F2, CPU_PMC, }, + + { 0x00000400, 0x00000417, CPU_MC, }, + { 0x00000480, 0x0000048B, CPU_VMX, }, + + { 0x00000600, 0x00000600, CPU_DEBUG, }, + { 0x00000680, 0x0000068F, CPU_LBRANCH, }, + { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, + + { 0x000107CC, 0x000107D3, CPU_PMC, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, }, + { 0xC0000081, 0xC0000084, CPU_CALL, }, + { 0xC0000100, 0xC0000102, CPU_BASE, }, + { 0xC0000103, 0xC0000103, CPU_TIME, }, + + { 0xC0010000, 0xC0010007, CPU_PMC, }, + { 0xC0010010, 0xC0010010, CPU_CONF, }, + { 0xC0010015, 0xC0010015, CPU_CONF, }, + { 0xC0010016, 0xC001001A, CPU_MTRR, }, + { 0xC001001D, 0xC001001D, CPU_MTRR, }, + { 0xC001001F, 0xC001001F, CPU_CONF, }, + { 0xC0010030, 0xC0010035, CPU_BIOS, }, + { 0xC0010044, 0xC0010048, CPU_MC, }, + { 0xC0010050, 0xC0010056, CPU_SMM, }, + { 0xC0010058, 0xC0010058, CPU_CONF, }, + { 0xC0010060, 0xC0010060, CPU_CACHE, }, + { 0xC0010061, 0xC0010068, CPU_SMM, }, + { 0xC0010069, 0xC001006B, CPU_SMM, }, + { 0xC0010070, 0xC0010071, CPU_SMM, }, + { 0xC0010111, 0xC0010113, CPU_SMM, }, + { 0xC0010114, 0xC0010118, CPU_SVM, }, + { 0xC0010140, 0xC0010141, CPU_OSVM, }, + { 0xC0011022, 0xC0011023, CPU_CONF, }, }; -/* AMD Registers Range */ -static struct cpu_debug_range cpu_amd_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, - { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, - { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, - { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, - { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, - { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, - { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, - { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, - { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, - { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, - { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, - { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, - { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, - { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, -}; - - -/* Intel */ -static int get_intel_modelflag(unsigned model) -{ - int flag; - - switch (model) { - case 0x0501: - case 0x0502: - case 0x0504: - flag = CPU_INTEL_PENTIUM; - break; - case 0x0601: - case 0x0603: - case 0x0605: - case 0x0607: - case 0x0608: - case 0x060A: - case 0x060B: - flag = CPU_INTEL_P6; - break; - case 0x0609: - case 0x060D: - flag = CPU_INTEL_PENTIUM_M; - break; - case 0x060E: - flag = CPU_INTEL_CORE; - break; - case 0x060F: - case 0x0617: - flag = CPU_INTEL_CORE2; - break; - case 0x061C: - flag = CPU_INTEL_ATOM; - break; - case 0x0F00: - case 0x0F01: - case 0x0F02: - case 0x0F03: - case 0x0F04: - flag = CPU_INTEL_XEON_P4; - break; - case 0x0F06: - flag = CPU_INTEL_XEON_MP; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -/* AMD */ -static int get_amd_modelflag(unsigned model) -{ - int flag; - - switch (model >> 8) { - case 0x6: - flag = CPU_AMD_K6; - break; - case 0x7: - flag = CPU_AMD_K7; - break; - case 0x8: - flag = CPU_AMD_K8; - break; - case 0xf: - flag = CPU_AMD_0F; - break; - case 0x10: - flag = CPU_AMD_10; - break; - case 0x11: - flag = CPU_AMD_11; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_modelflag(unsigned cpu) -{ - int flag; - - flag = per_cpu(cpu_model, cpu); - - switch (flag >> 16) { - case X86_VENDOR_INTEL: - flag = get_intel_modelflag(flag); - break; - case X86_VENDOR_AMD: - flag = get_amd_modelflag(flag & 0xffff); - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_range_count(unsigned cpu) -{ - int index; - - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - index = ARRAY_SIZE(cpu_intel_range); - break; - case X86_VENDOR_AMD: - index = ARRAY_SIZE(cpu_amd_range); - break; - default: - index = 0; - break; - } - - return index; -} - static int is_typeflag_valid(unsigned cpu, unsigned flag) { - unsigned vendor, modelflag; - int i, index; + int i; /* Standard Registers should be always valid */ if (flag >= CPU_TSS) return 1; - modelflag = per_cpu(cpu_modelflag, cpu); - vendor = per_cpu(cpu_model, cpu) >> 16; - index = get_cpu_range_count(cpu); - - for (i = 0; i < index; i++) { - switch (vendor) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[i].model & modelflag) && - (cpu_intel_range[i].flag & flag)) - return 1; - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[i].model & modelflag) && - (cpu_amd_range[i].flag & flag)) - return 1; - break; - } + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { + if (cpu_reg_range[i].flag == flag) + return 1; } /* Invalid */ @@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, int index, unsigned flag) { - unsigned modelflag; - - modelflag = per_cpu(cpu_modelflag, cpu); - *max = 0; - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[index].model & modelflag) && - (cpu_intel_range[index].flag & flag)) { - *min = cpu_intel_range[index].min; - *max = cpu_intel_range[index].max; - } - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[index].model & modelflag) && - (cpu_amd_range[index].flag & flag)) { - *min = cpu_amd_range[index].min; - *max = cpu_amd_range[index].max; - } - break; - } + if (cpu_reg_range[index].flag == flag) { + *min = cpu_reg_range[index].min; + *max = cpu_reg_range[index].max; + } else + *max = 0; return *max; } @@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) unsigned msr, msr_min, msr_max; struct cpu_private *priv; u32 low, high; - int i, range; + int i; if (seq) { priv = seq->private; @@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) } } - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) continue; @@ -588,8 +369,20 @@ static void print_apic(void *arg) seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); -#endif /* CONFIG_X86_LOCAL_APIC */ + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + unsigned int i, v, maxeilvt; + + v = apic_read(APIC_EFEAT); + maxeilvt = (v >> 16) & 0xff; + seq_printf(seq, " EFEAT\t\t: %08x\n", v); + seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); + for (i = 0; i < maxeilvt; i++) { + v = apic_read(APIC_EILVTn(i)); + seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); + } + } +#endif /* CONFIG_X86_LOCAL_APIC */ seq_printf(seq, "\n MSR\t:\n"); } @@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) { struct dentry *cpu_dentry = NULL; unsigned reg, reg_min, reg_max; - int i, range, err = 0; + int i, err = 0; char reg_dir[12]; u32 low, high; - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, ®_min, ®_max, i, cpu_base[type].flag)) continue; @@ -850,10 +641,6 @@ static int cpu_init_cpu(void) cpui = &cpu_data(cpu); if (!cpu_has(cpui, X86_FEATURE_MSR)) continue; - per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | - (cpui->x86 << 8) | - (cpui->x86_model)); - per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); sprintf(cpu_dir, "cpu%d", cpu); cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c839875478..f138c6c389b9 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -220,11 +220,14 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver" + tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" select CPU_FREQ_TABLE - depends on X86_32 + depends on X86_32 && EXPERIMENTAL help - This adds the CPUFreq driver for VIA C7 processors. + This adds the CPUFreq driver for VIA C7 processors. However, this driver + does not have any safeguards to prevent operating the CPU out of spec + and is thus considered dangerous. Please use the regular ACPI cpufreq + driver, enabled by CONFIG_X86_ACPI_CPUFREQ. If in doubt, say N. diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 752e8c6b2c7e..ae9b503220ca 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid) { struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return 0; - - return 1; + return cpu_has(cpu, X86_FEATURE_EST); } static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa133c02..daed39ba2614 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif -static void __cpuinit srat_detect_node(void) +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) unsigned node; int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); + int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ @@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } /* Work around errata */ - srat_detect_node(); + srat_detect_node(c); if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e102..789efe217e1a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@ #include <asm/processor.h> #include <asm/smp.h> +#include <asm/k8.h> #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs { unsigned long can_disable; }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} -}; -#endif - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -207,10 +200,17 @@ union l3_cache { }; static const unsigned short __cpuinitconst assocs[] = { - [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, [0xa] = 32, [0xb] = 48, + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, [0xc] = 64, - [0xf] = 0xffff // ?? + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ }; static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; if (leaf == 3) - eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + eax->split.num_threads_sharing = + current_cpu_data.x86_max_cores - 1; else eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; @@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { if (index < 3) return; + + if (boot_cpu_data.x86 == 0x11) + return; + + /* see erratum #382 */ + if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + return; + this_leaf->can_disable = 1; } @@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ - struct pci_dev *dev = NULL; - int i; - - for (i = 0; i <= node; i++) { - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_id[0], dev)); - if (!dev) - break; - } - return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ - return NULL; -} -#endif - -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - ssize_t ret = 0; - int i; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; if (!this_leaf->can_disable) - return sprintf(buf, "Feature not enabled\n"); - - dev = get_k8_northbridge(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; - } - for (i = 0; i < 2; i++) { - unsigned int reg; + if (!dev) + return -EINVAL; - pci_read_config_dword(dev, 0x1BC + i * 4, ®); + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "%x\n", reg); +} - ret += sprintf(buf, "%sEntry: %d\n", buf, i); - ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", - buf, - reg & 0x80000000 ? "Disabled" : "Allowed", - reg & 0x40000000 ? "Disabled" : "Allowed"); - ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", - buf, (reg & 0x30000) >> 16, reg & 0xfff); - } - return ret; +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ } +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, - size_t count) +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - unsigned int ret, index, val; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + unsigned int scrubber = 0; if (!this_leaf->can_disable) - return 0; - - if (strlen(buf) > 15) return -EINVAL; - ret = sscanf(buf, "%x %x", &index, &val); - if (ret != 2) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) return -EINVAL; - if (index > 1) + + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; val |= 0xc0000000; - dev = get_k8_northbridge(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); - return -EINVAL; - } + + pci_read_config_dword(dev, 0x58, &scrubber); + scrubber &= ~0x1f000000; + pci_write_config_dword(dev, 0x58, scrubber); pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); + return count; +} - return 1; +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) struct _cache_attr { struct attribute attr; @@ -808,7 +795,10 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); static struct attribute * default_attrs[] = { &type.attr, @@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, - &cache_disable.attr, + &cache_disable_0.attr, + &cache_disable_1.attr, NULL }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 09dd1d414fc3..289cc4815028 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -420,6 +420,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) out2: atomic_dec(&mce_entry); } +EXPORT_SYMBOL_GPL(do_machine_check); #ifdef CONFIG_X86_MCE_INTEL /*** diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index cef3ee30744b..65a0fceedcd7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -15,7 +15,6 @@ #include <asm/hw_irq.h> #include <asm/idle.h> #include <asm/therm_throt.h> -#include <asm/apic.h> asmlinkage void smp_thermal_interrupt(void) { diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04f..1d584a18a50d 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d21d4fb161f7..0543f69f0b27 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,9 +20,9 @@ struct fixed_range_block { }; static struct fixed_range_block fixed_range_blocks[] = { - { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ - { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ - { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs) k8_check_syscfg_dram_mod_en(); - rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) - rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) - rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) @@ -310,7 +310,7 @@ void __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; - rdmsr(MTRRcap_MSR, lo, dummy); + rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = (lo >> 8) & 1; for (i = 0; i < num_var_ranges; i++) @@ -318,7 +318,7 @@ void __init get_mtrr_state(void) if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); - rdmsr(MTRRdefType_MSR, lo, dummy); + rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) __flush_tlb(); /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock) __flush_tlb(); /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); @@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i static int generic_have_wrcomb(void) { unsigned long config, dummy; - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); return (config & (1 << 10)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c7..8fc248b5aeaf 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void) unsigned long config = 0, dummy; if (use_intel()) { - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); } else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347a..7538b767f206 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,21 +5,6 @@ #include <linux/types.h> #include <linux/stddef.h> -#define MTRRcap_MSR 0x0fe -#define MTRRdefType_MSR 0x2ff - -#define MTRRfix64K_00000_MSR 0x250 -#define MTRRfix16K_80000_MSR 0x258 -#define MTRRfix16K_A0000_MSR 0x259 -#define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685b..1f5fb1588d1f 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) if (use_intel()) /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else were excluded at the top */ ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { if (use_intel()) /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); else if (is_cpu(CYRIX)) /* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt) /* Restore MTRRdefType */ if (use_intel()) /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ctxt->ccr3); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 000000000000..895c82e78455 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,1704 @@ +/* + * Performance counter x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_counter.h> +#include <linux/capability.h> +#include <linux/notifier.h> +#include <linux/hardirq.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/sched.h> +#include <linux/uaccess.h> + +#include <asm/apic.h> +#include <asm/stacktrace.h> +#include <asm/nmi.h> + +static u64 perf_counter_mask __read_mostly; + +struct cpu_hw_counters { + struct perf_counter *counters[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long interrupts; + int enabled; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(void); + void (*enable)(struct hw_perf_counter *, int); + void (*disable)(struct hw_perf_counter *, int); + unsigned eventsel; + unsigned perfctr; + u64 (*event_map)(int); + u64 (*raw_event)(u64); + int max_events; + int num_counters; + int num_counters_fixed; + int counter_bits; + u64 counter_mask; + u64 max_period; + u64 intel_ctrl; +}; + +static struct x86_pmu x86_pmu __read_mostly; + +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { + .enabled = 1, +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static u64 intel_pmu_event_map(int event) +{ + return intel_perfmon_event_map[event]; +} + +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ + CORE_EVNTSEL_COUNTER_MASK) + + return event & CORE_EVNTSEL_MASK; +} + +static const u64 amd_0f_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int event) +{ + return amd_perfmon_event_map[event]; +} + +static u64 amd_pmu_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_COUNTER_MASK) + + return event & K7_EVNTSEL_MASK; +} + +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static u64 +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + int shift = 64 - x86_pmu.counter_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; + + /* + * Careful: an NMI might modify the previous counter value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static atomic_t active_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(x86_pmu.eventsel + i); + + i = x86_pmu.num_counters; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(x86_pmu.perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < x86_pmu.num_counters; i++) { + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_counter_init(struct perf_counter *counter) +{ + struct perf_counter_attr *attr = &counter->attr; + struct hw_perf_counter *hwc = &counter->hw; + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = 0; + if (!atomic_inc_not_zero(&active_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&active_counters); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to. + */ + if (!attr->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!attr->exclude_kernel) + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + + if (!hwc->sample_period) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } + + counter->destroy = hw_perf_counter_destroy; + + /* + * Raw event type provide the config in the event structure + */ + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); + return 0; + } + + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + /* + * The generic map: + */ + hwc->config |= x86_pmu.event_map(attr->config); + + return 0; +} + +static void intel_pmu_disable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int idx; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * counters proper, so that amd_pmu_enable_counter() does the + * right thing. + */ + barrier(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_disable(void) +{ + if (!x86_pmu_initialized()) + return; + return x86_pmu.disable_all(); +} + +static void intel_pmu_enable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static void amd_pmu_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int idx; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + continue; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_enable(void) +{ + if (!x86_pmu_initialized()) + return; + x86_pmu.enable_all(); +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + int err; + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + int err; + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + int err; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_counter(hwc, idx); +} + +static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + x86_pmu_disable_counter(hwc, idx); +} + +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the counter disabled in hw: + */ +static int +x86_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int err, ret = 0; + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + /* + * Quirk: certain CPUs dont like it if just 1 event is left: + */ + if (unlikely(left < 2)) + left = 2; + + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)-left); + + err = checking_wrmsrl(hwc->counter_base + idx, + (u64)(-left) & x86_pmu.counter_mask); + + return ret; +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); + else + x86_pmu_disable_counter(hwc, idx); +} + +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +{ + unsigned int event; + + if (!x86_pmu.num_counters_fixed) + return -1; + + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + + return -1; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ +static int x86_pmu_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx; + + idx = fixed_mode_idx(counter, hwc); + if (idx >= 0) { + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used_mask)) + goto try_generic; + + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->counter_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic counter again */ + if (test_and_set_bit(idx, cpuc->used_mask)) { +try_generic: + idx = find_first_zero_bit(cpuc->used_mask, + x86_pmu.num_counters); + if (idx == x86_pmu.num_counters) + return -EAGAIN; + + set_bit(idx, cpuc->used_mask); + hwc->idx = idx; + } + hwc->config_base = x86_pmu.eventsel; + hwc->counter_base = x86_pmu.perfctr; + } + + perf_counters_lapic_init(); + + x86_pmu.disable(hwc, idx); + + cpuc->counters[idx] = counter; + set_bit(idx, cpuc->active_mask); + + x86_perf_counter_set_period(counter, hwc, idx); + x86_pmu.enable(hwc, idx); + + return 0; +} + +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + +void perf_counter_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + struct cpu_hw_counters *cpuc; + unsigned long flags; + int cpu, idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + if (x86_pmu.version >= 2) { + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + } + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); + + prev_left = per_cpu(prev_left[idx], cpu); + + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", + cpu, idx, prev_left); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } + local_irq_restore(flags); +} + +static void x86_pmu_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active_mask); + x86_pmu.disable(hwc, idx); + + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the counter: + */ + barrier(); + + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); + cpuc->counters[idx] = NULL; + clear_bit(idx, cpuc->used_mask); +} + +/* + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: + */ +static int intel_pmu_save_and_restart(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + int ret; + + x86_perf_counter_update(counter, hwc, idx); + ret = x86_perf_counter_set_period(counter, hwc, idx); + + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + intel_pmu_enable_counter(hwc, idx); + + return ret; +} + +static void intel_pmu_reset(void) +{ + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + + local_irq_restore(flags); +} + + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + int bit, cpu, loops; + u64 ack, status; + + data.regs = regs; + data.addr = 0; + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + perf_disable(); + status = intel_pmu_get_status(); + if (!status) { + perf_enable(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + perf_counter_print_debug(); + intel_pmu_reset(); + perf_enable(); + return 1; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_counter *counter = cpuc->counters[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(counter)) + continue; + + if (perf_counter_overflow(counter, 1, &data)) + intel_pmu_disable_counter(&counter->hw, bit); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + + perf_enable(); + + return 1; +} + +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int cpu, idx, handled = 0; + u64 val; + + data.regs = regs; + data.addr = 0; + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + counter = cpuc->counters[idx]; + hwc = &counter->hw; + + val = x86_perf_counter_update(counter, hwc, idx); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + + if (perf_counter_overflow(counter, 1, &data)) + amd_pmu_disable_counter(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_counter_do_pending(); + irq_exit(); +} + +void set_perf_counter_pending(void) +{ + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +} + +void perf_counters_lapic_init(void) +{ + if (!x86_pmu_initialized()) + return; + + /* + * Always use NMI for PMU + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (!atomic_read(&active_counters)) + return NOTIFY_DONE; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + apic_write(APIC_LVTPC, APIC_DM_NMI); + /* + * Can't rely on the handled return value to say it was our NMI, two + * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { + .notifier_call = perf_counter_nmi_handler, + .next = NULL, + .priority = 1 +}; + +static struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_counter, + .disable = intel_pmu_disable_counter, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + .max_period = (1ULL << 31) - 1, +}; + +static struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_counter, + .disable = amd_pmu_disable_counter, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 4, + .counter_bits = 48, + .counter_mask = (1ULL << 48) - 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, +}; + +static int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return -ENODEV; + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired Event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + return -ENODEV; + + x86_pmu = intel_pmu; + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose counters, so + * assume at least 3 counters: + */ + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Core2 events, "); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Atom events, "); + break; + } + return 0; +} + +static int amd_pmu_init(void) +{ + x86_pmu = amd_pmu; + + switch (boot_cpu_data.x86) { + case 0x0f: + case 0x10: + case 0x11: + memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("AMD Family 0f/10/11 events, "); + break; + } + return 0; +} + +void __init init_hw_perf_counters(void) +{ + int err; + + pr_info("Performance Counters: "); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_pmu_init(); + break; + case X86_VENDOR_AMD: + err = amd_pmu_init(); + break; + default: + return; + } + if (err != 0) { + pr_cont("no PMU driver, software counters only.\n"); + return; + } + + pr_cont("%s PMU driver.\n", x86_pmu.name); + + if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; + WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", + x86_pmu.num_counters, X86_PMC_MAX_GENERIC); + } + perf_counter_mask = (1 << x86_pmu.num_counters) - 1; + perf_max_counters = x86_pmu.num_counters; + + if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; + WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); + } + + perf_counter_mask |= + ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + + perf_counters_lapic_init(); + register_die_notifier(&perf_counter_nmi_notifier); + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... generic counters: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); +} + +static inline void x86_pmu_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, +}; + +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return ERR_PTR(err); + + return &pmu; +} + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +{ + if (entry->nr < MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + unsigned long bp; + char *stack; + int nr = entry->nr; + + callchain_store(entry, instruction_pointer(regs)); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = frame_pointer(regs); +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + + entry->kernel = entry->nr - nr; +} + + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + int nr = entry->nr; + + regs = (struct pt_regs *)current->thread.sp0 - 1; + fp = (void __user *)regs->bp; + + callchain_store(entry, regs->ip); + + while (entry->nr < MAX_STACK_DEPTH) { + frame.next_fp = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < user_stack_pointer(regs)) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_fp; + } + + entry->user = entry->nr - nr; +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(nmi_entry); + else + entry = &__get_cpu_var(irq_entry); + + entry->nr = 0; + entry->hv = 0; + entry->kernel = 0; + entry->user = 0; + + perf_do_callchain(regs, entry); + + return entry; +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f6c70a164e32..d6f5b9fbde32 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -19,8 +19,8 @@ #include <linux/nmi.h> #include <linux/kprobes.h> -#include <asm/genapic.h> -#include <asm/intel_arch_perfmon.h> +#include <asm/apic.h> +#include <asm/perf_counter.h> struct nmi_watchdog_ctlblk { unsigned int cccr_msr; diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 87b67e3a765a..48bfe1386038 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -19,45 +19,61 @@ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009 */ - -#include <asm/ds.h> - -#include <linux/errno.h> +#include <linux/kernel.h> #include <linux/string.h> -#include <linux/slab.h> +#include <linux/errno.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/mm.h> -#include <linux/kernel.h> +#include <linux/trace_clock.h> + +#include <asm/ds.h> +#include "ds_selftest.h" /* - * The configuration for a particular DS hardware implementation. + * The configuration for a particular DS hardware implementation: */ struct ds_configuration { - /* the name of the configuration */ - const char *name; - /* the size of one pointer-typed field in the DS structure and - in the BTS and PEBS buffers in bytes; - this covers the first 8 DS fields related to buffer management. */ - unsigned char sizeof_field; - /* the size of a BTS/PEBS record in bytes */ - unsigned char sizeof_rec[2]; - /* a series of bit-masks to control various features indexed - * by enum ds_feature */ - unsigned long ctl[dsf_ctl_max]; + /* The name of the configuration: */ + const char *name; + + /* The size of pointer-typed fields in DS, BTS, and PEBS: */ + unsigned char sizeof_ptr_field; + + /* The size of a BTS/PEBS record in bytes: */ + unsigned char sizeof_rec[2]; + + /* The number of pebs counter reset values in the DS structure. */ + unsigned char nr_counter_reset; + + /* Control bit-masks indexed by enum ds_feature: */ + unsigned long ctl[dsf_ctl_max]; }; -static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +static struct ds_configuration ds_cfg __read_mostly; + + +/* Maximal size of a DS configuration: */ +#define MAX_SIZEOF_DS 0x80 -#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) +/* Maximal size of a BTS record: */ +#define MAX_SIZEOF_BTS (3 * 8) -#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ -#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ +/* BTS and PEBS buffer alignment: */ +#define DS_ALIGNMENT (1 << 3) -#define BTS_CONTROL \ - (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ - ds_cfg.ctl[dsf_bts_overflow]) +/* Number of buffer pointers in DS: */ +#define NUM_DS_PTR_FIELDS 8 +/* Size of a pebs reset value in DS: */ +#define PEBS_RESET_FIELD_SIZE 8 + +/* Mask of control bits in the DS MSR register: */ +#define BTS_CONTROL \ + ( ds_cfg.ctl[dsf_bts] | \ + ds_cfg.ctl[dsf_bts_kernel] | \ + ds_cfg.ctl[dsf_bts_user] | \ + ds_cfg.ctl[dsf_bts_overflow] ) /* * A BTS or PEBS tracer. @@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); * to identify tracers. */ struct ds_tracer { - /* the DS context (partially) owned by this tracer */ - struct ds_context *context; - /* the buffer provided on ds_request() and its size in bytes */ - void *buffer; - size_t size; + /* The DS context (partially) owned by this tracer. */ + struct ds_context *context; + /* The buffer provided on ds_request() and its size in bytes. */ + void *buffer; + size_t size; }; struct bts_tracer { - /* the common DS part */ - struct ds_tracer ds; - /* the trace including the DS configuration */ - struct bts_trace trace; - /* buffer overflow notification function */ - bts_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct bts_trace trace; + + /* Buffer overflow notification function: */ + bts_ovfl_callback_t ovfl; + + /* Active flags affecting trace collection. */ + unsigned int flags; }; struct pebs_tracer { - /* the common DS part */ - struct ds_tracer ds; - /* the trace including the DS configuration */ - struct pebs_trace trace; - /* buffer overflow notification function */ - pebs_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct pebs_trace trace; + + /* Buffer overflow notification function: */ + pebs_ovfl_callback_t ovfl; }; /* @@ -97,6 +120,7 @@ struct pebs_tracer { * * The DS configuration consists of the following fields; different * architetures vary in the size of those fields. + * * - double-word aligned base linear address of the BTS buffer * - write pointer into the BTS buffer * - end linear address of the BTS buffer (one byte beyond the end of @@ -135,21 +159,22 @@ enum ds_field { }; enum ds_qualifier { - ds_bts = 0, + ds_bts = 0, ds_pebs }; -static inline unsigned long ds_get(const unsigned char *base, - enum ds_qualifier qual, enum ds_field field) +static inline unsigned long +ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } -static inline void ds_set(unsigned char *base, enum ds_qualifier qual, - enum ds_field field, unsigned long value) +static inline void +ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, + unsigned long value) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; } @@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, */ static DEFINE_SPINLOCK(ds_lock); - /* * We either support (system-wide) per-cpu or per-thread allocation. * We distinguish the two based on the task_struct pointer, where a @@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock); */ static atomic_t tracers = ATOMIC_INIT(0); -static inline void get_tracer(struct task_struct *task) +static inline int get_tracer(struct task_struct *task) { - if (task) + int error; + + spin_lock_irq(&ds_lock); + + if (task) { + error = -EPERM; + if (atomic_read(&tracers) < 0) + goto out; atomic_inc(&tracers); - else + } else { + error = -EPERM; + if (atomic_read(&tracers) > 0) + goto out; atomic_dec(&tracers); + } + + error = 0; +out: + spin_unlock_irq(&ds_lock); + return error; } static inline void put_tracer(struct task_struct *task) @@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task) atomic_inc(&tracers); } -static inline int check_tracer(struct task_struct *task) -{ - return task ? - (atomic_read(&tracers) >= 0) : - (atomic_read(&tracers) <= 0); -} - - /* * The DS context is either attached to a thread or to a cpu: * - in the former case, the thread_struct contains a pointer to the @@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ - struct bts_tracer *bts_master; - struct pebs_tracer *pebs_master; - /* use count */ - unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ - struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ - struct task_struct *task; -}; + /* The DS configuration; goes into MSR_IA32_DS_AREA: */ + unsigned char ds[MAX_SIZEOF_DS]; + + /* The owner of the BTS and PEBS configuration, respectively: */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; -static DEFINE_PER_CPU(struct ds_context *, system_context_array); + /* Use count: */ + unsigned long count; -#define system_context per_cpu(system_context_array, smp_processor_id()) + /* Pointer to the context pointer field: */ + struct ds_context **this; + + /* The traced task; NULL for cpu tracing: */ + struct task_struct *task; + + /* The traced cpu; only valid if task is NULL: */ + int cpu; +}; +static DEFINE_PER_CPU(struct ds_context *, cpu_context); -static inline struct ds_context *ds_get_context(struct task_struct *task) + +static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &system_context); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; - unsigned long irq; /* Chances are small that we already have a context. */ new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); if (!new_context) return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irq(&ds_lock); context = *p_context; - if (!context) { + if (likely(!context)) { context = new_context; context->this = p_context; context->task = task; + context->cpu = cpu; context->count = 0; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - *p_context = context; } context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); if (context != new_context) kfree(new_context); @@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) return context; } -static inline void ds_put_context(struct ds_context *context) +static void ds_put_context(struct ds_context *context) { + struct task_struct *task; unsigned long irq; if (!context) @@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context) *(context->this) = NULL; - if (context->task) - clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + task = context->task; + + if (task) + clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (!context->task || (context->task == current)) - wrmsrl(MSR_IA32_DS_AREA, 0); + /* + * We leave the (now dangling) pointer to the DS configuration in + * the DS_AREA msr. This is as good or as bad as replacing it with + * NULL - the hardware would crash if we enabled tracing. + * + * This saves us some problems with having to write an msr on a + * different cpu while preventing others from doing the same for the + * next context for that same cpu. + */ spin_unlock_irqrestore(&ds_lock, irq); + /* The context might still be in use for context switching. */ + if (task && (task != current)) + wait_task_context_switch(task); + kfree(context); } +static void ds_install_ds_area(struct ds_context *context) +{ + unsigned long ds; + + ds = (unsigned long)context->ds; + + /* + * There is a race between the bts master and the pebs master. + * + * The thread/cpu access is synchronized via get/put_cpu() for + * task tracing and via wrmsr_on_cpu for cpu tracing. + * + * If bts and pebs are collected for the same task or same cpu, + * the same confiuration is written twice. + */ + if (context->task) { + get_cpu(); + if (context->task == current) + wrmsrl(MSR_IA32_DS_AREA, ds); + set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + put_cpu(); + } else + wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, + (u32)((u64)ds), (u32)((u64)ds >> 32)); +} /* * Call the tracer's callback on a buffer overflow. @@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) * The remainder of any partially written record is zeroed out. * * context: the DS context - * qual: the buffer type - * record: the data to write - * size: the size of the data + * qual: the buffer type + * record: the data to write + * size: the size of the data */ static int ds_write(struct ds_context *context, enum ds_qualifier qual, const void *record, size_t size) @@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, unsigned long write_size, adj_write_size; /* - * write as much as possible without producing an + * Write as much as possible without producing an * overflow interrupt. * - * interrupt_threshold must either be + * Interrupt_threshold must either be * - bigger than absolute_maximum or * - point to a record between buffer_base and absolute_maximum * - * index points to a valid record. + * Index points to a valid record. */ base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); @@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, write_end = min(end, int_th); - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ + /* + * If we are already beyond the interrupt threshold, + * we fill the entire buffer. + */ if (write_end <= index) write_end = end; @@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; - /* zero out trailing bytes */ + /* Zero out trailing bytes. */ memset((char *)index + write_size, 0, adj_write_size - write_size); index += adj_write_size; @@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, * Later architectures use 64bit pointers throughout, whereas earlier * architectures use 32bit pointers in 32bit mode. * - * We compute the base address for the first 8 fields based on: + * We compute the base address for the fields based on: * - the field size stored in the DS configuration * - the relative field position * @@ -431,23 +501,23 @@ enum bts_field { bts_to, bts_flags, - bts_qual = bts_from, - bts_jiffies = bts_to, - bts_pid = bts_flags, + bts_qual = bts_from, + bts_clock = bts_to, + bts_pid = bts_flags, - bts_qual_mask = (bts_qual_max - 1), - bts_escape = ((unsigned long)-1 & ~bts_qual_mask) + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; static inline unsigned long bts_get(const char *base, enum bts_field field) { - base += (ds_cfg.sizeof_field * field); + base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } static inline void bts_set(char *base, enum bts_field field, unsigned long val) { - base += (ds_cfg.sizeof_field * field);; + base += (ds_cfg.sizeof_ptr_field * field);; (*(unsigned long *)base) = val; } @@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val) * * return: bytes read/written on success; -Eerrno, otherwise */ -static int bts_read(struct bts_tracer *tracer, const void *at, - struct bts_struct *out) +static int +bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) { if (!tracer) return -EINVAL; @@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at, memset(out, 0, sizeof(*out)); if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); - out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); - out->variant.timestamp.pid = bts_get(at, bts_pid); + out->variant.event.clock = bts_get(at, bts_clock); + out->variant.event.pid = bts_get(at, bts_pid); } else { out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); @@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) case bts_task_arrives: case bts_task_departs: bts_set(raw, bts_qual, (bts_escape | in->qualifier)); - bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); - bts_set(raw, bts_pid, in->variant.timestamp.pid); + bts_set(raw, bts_clock, in->variant.event.clock); + bts_set(raw, bts_pid, in->variant.event.pid); break; default: return -EINVAL; @@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, unsigned int flags) { unsigned long buffer, adj; - /* adjust the buffer address and size to meet alignment + /* + * Adjust the buffer address and size to meet alignment * constraints: * - buffer is double-word aligned * - size is multiple of record size @@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, trace->begin = (void *)buffer; trace->top = trace->begin; trace->end = (void *)(buffer + size); - /* The value for 'no threshold' is -1, which will set the + /* + * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ + ith *= ds_cfg.sizeof_rec[qual]; trace->ith = (void *)(buffer + size - ith); trace->flags = flags; @@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, enum ds_qualifier qual, struct task_struct *task, - void *base, size_t size, size_t th, unsigned int flags) + int cpu, void *base, size_t size, size_t th) { struct ds_context *context; int error; + size_t req_size; + + error = -EOPNOTSUPP; + if (!ds_cfg.sizeof_rec[qual]) + goto out; error = -EINVAL; if (!base) goto out; - /* we require some space to do alignment adjustments below */ + req_size = ds_cfg.sizeof_rec[qual]; + /* We might need space for alignment adjustments. */ + if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) + req_size += DS_ALIGNMENT; + error = -EINVAL; - if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + if (size < req_size) goto out; if (th != (size_t)-1) { @@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, tracer->size = size; error = -ENOMEM; - context = ds_get_context(task); + context = ds_get_context(task, cpu); if (!context) goto out; tracer->context = context; - ds_init_ds_trace(trace, qual, base, size, th, flags); + /* + * Defer any tracer-specific initialization work for the context until + * context ownership has been clarified. + */ error = 0; out: return error; } -struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; - unsigned long irq; int error; + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; - if (!ds_cfg.ctl[dsf_bts]) + if (ovfl) goto out; - /* buffer overflow notification is not yet implemented */ - error = -EOPNOTSUPP; - if (ovfl) + error = get_tracer(task); + if (error < 0) goto out; error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_bts, task, base, size, th, flags); + ds_bts, task, cpu, base, size, th); if (error < 0) goto out_tracer; - - spin_lock_irqsave(&ds_lock, irq); - - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); + /* Claim the bts part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->bts_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->bts_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the bts part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_install_ds_area(tracer->ds.context); tracer->trace.read = bts_read; tracer->trace.write = bts_write; - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + /* Start tracing. */ ds_resume_bts(tracer); return tracer; - out_put_tracer: - put_tracer(task); out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } -struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(task, 0, base, size, ovfl, th, flags); +} + +struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); +} + +static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; - unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_pebs, task, base, size, th, flags); + ds_pebs, task, cpu, base, size, th); if (error < 0) goto out_tracer; - spin_lock_irqsave(&ds_lock, irq); - - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); + /* Claim the pebs part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->pebs_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->pebs_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the pebs part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + ds_install_ds_area(tracer->ds.context); + + /* Start tracing. */ ds_resume_pebs(tracer); return tracer; - out_put_tracer: - put_tracer(task); out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } -void ds_release_bts(struct bts_tracer *tracer) +struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) { - if (!tracer) - return; + return ds_request_pebs(task, 0, base, size, ovfl, th, flags); +} - ds_suspend_bts(tracer); +struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); +} + +static void ds_free_bts(struct bts_tracer *tracer) +{ + struct task_struct *task; + + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; - put_tracer(tracer->ds.context->task); + /* Make sure tracing stopped and the tracer is not in use. */ + if (task && (task != current)) + wait_task_context_switch(task); + ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } +void ds_release_bts(struct bts_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_bts(tracer); + ds_free_bts(tracer); +} + +int ds_release_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_bts_noirq(tracer); + ds_free_bts(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void update_task_debugctlmsr(struct task_struct *task, + unsigned long debugctlmsr) +{ + task->thread.debugctlmsr = debugctlmsr; + + get_cpu(); + if (task == current) + update_debugctlmsr(debugctlmsr); + put_cpu(); +} + void ds_suspend_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; + tracer->flags = 0; + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + WARN_ON(!task && irqs_disabled()); - if (task) { - task->thread.debugctlmsr &= ~BTS_CONTROL; + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr &= ~BTS_CONTROL; - if (!task->thread.debugctlmsr) - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); } -void ds_resume_bts(struct bts_tracer *tracer) +int ds_suspend_bts_noirq(struct bts_tracer *tracer) { struct task_struct *task; - unsigned long control; + unsigned long debugctlmsr, irq; + int cpu, error = 0; if (!tracer) - return; + return 0; + + tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr &= ~BTS_CONTROL; + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static unsigned long ds_bts_control(struct bts_tracer *tracer) +{ + unsigned long control; control = ds_cfg.ctl[dsf_bts]; if (!(tracer->trace.ds.flags & BTS_KERNEL)) @@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!(tracer->trace.ds.flags & BTS_USER)) control |= ds_cfg.ctl[dsf_bts_user]; - if (task) { - task->thread.debugctlmsr |= control; - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } - - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() | control); + return control; } -void ds_release_pebs(struct pebs_tracer *tracer) +void ds_resume_bts(struct bts_tracer *tracer) { + struct task_struct *task; + unsigned long debugctlmsr; + int cpu; + if (!tracer) return; - ds_suspend_pebs(tracer); + tracer->flags = tracer->trace.ds.flags; + + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + WARN_ON(!task && irqs_disabled()); + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); +} + +int ds_resume_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long debugctlmsr, irq; + int cpu, error = 0; + + if (!tracer) + return 0; + + tracer->flags = tracer->trace.ds.flags; + + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void ds_free_pebs(struct pebs_tracer *tracer) +{ + struct task_struct *task; + + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; - put_tracer(tracer->ds.context->task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } +void ds_release_pebs(struct pebs_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_pebs(tracer); + ds_free_pebs(tracer); +} + +int ds_release_pebs_noirq(struct pebs_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_pebs_noirq(tracer); + ds_free_pebs(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + void ds_suspend_pebs(struct pebs_tracer *tracer) { } +int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + void ds_resume_pebs(struct pebs_tracer *tracer) { } +int ds_resume_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) @@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) return NULL; ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + + tracer->trace.counters = ds_cfg.nr_counter_reset; + memcpy(tracer->trace.counter_reset, + tracer->ds.context->ds + + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), + ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); return &tracer->trace; } @@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer) tracer->trace.ds.top = tracer->trace.ds.begin; - ds_set(tracer->ds.context->ds, ds_bts, ds_index, + ds_set(tracer->ds.context->ds, ds_pebs, ds_index, (unsigned long)tracer->trace.ds.top); return 0; } -int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value) { if (!tracer) return -EINVAL; - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + if (ds_cfg.nr_counter_reset < counter) + return -EINVAL; + + *(u64 *)(tracer->ds.context->ds + + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + + (counter * PEBS_RESET_FIELD_SIZE)) = value; return 0; } @@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), - - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, - .sizeof_rec[ds_pebs] = 8 * 18, + .nr_counter_reset = 1, +}; +static const struct ds_configuration ds_cfg_core_i7 = { + .name = "Core i7", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 4, }; static void -ds_configure(const struct ds_configuration *cfg) +ds_configure(const struct ds_configuration *cfg, + struct cpuinfo_x86 *cpu) { + unsigned long nr_pebs_fields = 0; + + printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); + +#ifdef __i386__ + nr_pebs_fields = 10; +#else + nr_pebs_fields = 18; +#endif + + /* + * Starting with version 2, architectural performance + * monitoring supports a format specifier. + */ + if ((cpuid_eax(0xa) & 0xff) > 1) { + unsigned long perf_capabilities, format; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + + format = (perf_capabilities >> 8) & 0xf; + + switch (format) { + case 0: + nr_pebs_fields = 18; + break; + case 1: + nr_pebs_fields = 22; + break; + default: + printk(KERN_INFO + "[ds] unknown PEBS format: %lu\n", format); + nr_pebs_fields = 0; + break; + } + } + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + ds_cfg.sizeof_ptr_field = + (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); + + ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; + ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; - if (!cpu_has_bts) { - ds_cfg.ctl[dsf_bts] = 0; + if (!cpu_has(cpu, X86_FEATURE_BTS)) { + ds_cfg.sizeof_rec[ds_bts] = 0; printk(KERN_INFO "[ds] bts not available\n"); } - if (!cpu_has_pebs) + if (!cpu_has(cpu, X86_FEATURE_PEBS)) { + ds_cfg.sizeof_rec[ds_pebs] = 0; printk(KERN_INFO "[ds] pebs not available\n"); + } + + printk(KERN_INFO "[ds] sizes: address: %u bit, ", + 8 * ds_cfg.sizeof_ptr_field); + printk("bts/pebs record: %u/%u bytes\n", + ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); + WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) { + /* Only configure the first cpu. Others are identical. */ + if (ds_cfg.name) + return; + switch (c->x86) { case 0x6: switch (c->x86_model) { case 0x9: case 0xd: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_pentium_m, c); break; case 0xf: case 0x17: /* Core2 */ case 0x1c: /* Atom */ - ds_configure(&ds_cfg_core2_atom); + ds_configure(&ds_cfg_core2_atom, c); + break; + case 0x1a: /* Core i7 */ + ds_configure(&ds_cfg_core_i7, c); break; - case 0x1a: /* i7 */ default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; @@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_netburst, c); break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } } +static inline void ds_take_timestamp(struct ds_context *context, + enum bts_qualifier qualifier, + struct task_struct *task) +{ + struct bts_tracer *tracer = context->bts_master; + struct bts_struct ts; + + /* Prevent compilers from reading the tracer pointer twice. */ + barrier(); + + if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) + return; + + memset(&ts, 0, sizeof(ts)); + ts.qualifier = qualifier; + ts.variant.event.clock = trace_clock_global(); + ts.variant.event.pid = task->pid; + + bts_write(tracer, &ts); +} + /* * Change the DS configuration from tracing prev to tracing next. */ void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - struct ds_context *prev_ctx = prev->thread.ds_ctx; - struct ds_context *next_ctx = next->thread.ds_ctx; + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + unsigned long debugctlmsr = next->thread.debugctlmsr; + + /* Make sure all data is read before we start. */ + barrier(); if (prev_ctx) { update_debugctlmsr(0); - if (prev_ctx->bts_master && - (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_departs, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = prev->pid - }; - bts_write(prev_ctx->bts_master, &ts); - } + ds_take_timestamp(prev_ctx, bts_task_departs, prev); } if (next_ctx) { - if (next_ctx->bts_master && - (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_arrives, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = next->pid - }; - bts_write(next_ctx->bts_master, &ts); - } + ds_take_timestamp(next_ctx, bts_task_arrives, next); wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } - update_debugctlmsr(next->thread.debugctlmsr); + update_debugctlmsr(debugctlmsr); } -void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +static __init int ds_selftest(void) { - clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); - tsk->thread.ds_ctx = NULL; -} + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; -void ds_exit_thread(struct task_struct *tsk) -{ + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + + return 0; } +device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c new file mode 100644 index 000000000000..6bc7c199ab99 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.c @@ -0,0 +1,408 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2009 + */ + +#include "ds_selftest.h" + +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <asm/ds.h> + + +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ + +struct ds_selftest_bts_conf { + struct bts_tracer *tracer; + int error; + int (*suspend)(struct bts_tracer *); + int (*resume)(struct bts_tracer *); +}; + +static int ds_selftest_bts_consistency(const struct bts_trace *trace) +{ + int error = 0; + + if (!trace) { + printk(KERN_CONT "failed to access trace..."); + /* Bail out. Other tests are pointless. */ + return -1; + } + + if (!trace->read) { + printk(KERN_CONT "bts read not available..."); + error = -1; + } + + /* Do some sanity checks on the trace configuration. */ + if (!trace->ds.n) { + printk(KERN_CONT "empty bts buffer..."); + error = -1; + } + if (!trace->ds.size) { + printk(KERN_CONT "bad bts trace setup..."); + error = -1; + } + if (trace->ds.end != + (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { + printk(KERN_CONT "bad bts buffer setup..."); + error = -1; + } + /* + * We allow top in [begin; end], since its not clear when the + * overflow adjustment happens: after the increment or before the + * write. + */ + if ((trace->ds.top < trace->ds.begin) || + (trace->ds.end < trace->ds.top)) { + printk(KERN_CONT "bts top out of bounds..."); + error = -1; + } + + return error; +} + +static int ds_selftest_bts_read(struct bts_tracer *tracer, + const struct bts_trace *trace, + const void *from, const void *to) +{ + const unsigned char *at; + + /* + * Check a few things which do not belong to this test. + * They should be covered by other tests. + */ + if (!trace) + return -1; + + if (!trace->read) + return -1; + + if (to < from) + return -1; + + if (from < trace->ds.begin) + return -1; + + if (trace->ds.end < to) + return -1; + + if (!trace->ds.size) + return -1; + + /* Now to the test itself. */ + for (at = from; (void *)at < to; at += trace->ds.size) { + struct bts_struct bts; + unsigned long index; + int error; + + if (((void *)at - trace->ds.begin) % trace->ds.size) { + printk(KERN_CONT + "read from non-integer index..."); + return -1; + } + index = ((void *)at - trace->ds.begin) / trace->ds.size; + + memset(&bts, 0, sizeof(bts)); + error = trace->read(tracer, at, &bts); + if (error < 0) { + printk(KERN_CONT + "error reading bts trace at [%lu] (0x%p)...", + index, at); + return error; + } + + switch (bts.qualifier) { + case BTS_BRANCH: + break; + default: + printk(KERN_CONT + "unexpected bts entry %llu at [%lu] (0x%p)...", + bts.qualifier, index, at); + return -1; + } + } + + return 0; +} + +static void ds_selftest_bts_cpu(void *arg) +{ + struct ds_selftest_bts_conf *conf = arg; + const struct bts_trace *trace; + void *top; + + if (IS_ERR(conf->tracer)) { + conf->error = PTR_ERR(conf->tracer); + conf->tracer = NULL; + + printk(KERN_CONT + "initialization failed (err: %d)...", conf->error); + return; + } + + /* We should meanwhile have enough trace. */ + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; + + /* Let's see if we can access the trace. */ + trace = ds_read_bts(conf->tracer); + + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; + + /* If everything went well, we should have a few trace entries. */ + if (trace->ds.top == trace->ds.begin) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning, but continue. + */ + printk(KERN_CONT "no trace/overflow..."); + } + + /* Let's try to read the trace we collected. */ + conf->error = + ds_selftest_bts_read(conf->tracer, trace, + trace->ds.begin, trace->ds.top); + if (conf->error < 0) + return; + + /* + * Let's read the trace again. + * Since we suspended tracing, we should get the same result. + */ + top = trace->ds.top; + + trace = ds_read_bts(conf->tracer); + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; + + if (top != trace->ds.top) { + printk(KERN_CONT "suspend not working..."); + conf->error = -1; + return; + } + + /* Let's collect some more trace - see if resume is working. */ + conf->error = conf->resume(conf->tracer); + if (conf->error < 0) + return; + + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; + + trace = ds_read_bts(conf->tracer); + + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; + + if (trace->ds.top == top) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning and check the full trace. + */ + printk(KERN_CONT + "no resume progress/overflow..."); + + conf->error = + ds_selftest_bts_read(conf->tracer, trace, + trace->ds.begin, trace->ds.end); + } else if (trace->ds.top < top) { + /* + * We had a buffer overflow - the entire buffer should + * contain trace records. + */ + conf->error = + ds_selftest_bts_read(conf->tracer, trace, + trace->ds.begin, trace->ds.end); + } else { + /* + * It is quite likely that the buffer did not overflow. + * Let's just check the delta trace. + */ + conf->error = + ds_selftest_bts_read(conf->tracer, trace, top, + trace->ds.top); + } + if (conf->error < 0) + return; + + conf->error = 0; +} + +static int ds_suspend_bts_wrap(struct bts_tracer *tracer) +{ + ds_suspend_bts(tracer); + return 0; +} + +static int ds_resume_bts_wrap(struct bts_tracer *tracer) +{ + ds_resume_bts(tracer); + return 0; +} + +static void ds_release_bts_noirq_wrap(void *tracer) +{ + (void)ds_release_bts_noirq(tracer); +} + +static int ds_selftest_bts_bad_release_noirq(int cpu, + struct bts_tracer *tracer) +{ + int error = -EPERM; + + /* Try to release the tracer on the wrong cpu. */ + get_cpu(); + if (cpu != smp_processor_id()) { + error = ds_release_bts_noirq(tracer); + if (error != -EPERM) + printk(KERN_CONT "release on wrong cpu..."); + } + put_cpu(); + + return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + ds_release_bts(tracer); + error = 0; + } + + if (error != -EPERM) + printk(KERN_CONT "cpu/task tracing overlap..."); + + return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_task(void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + error = 0; + ds_release_bts(tracer); + } + + if (error != -EPERM) + printk(KERN_CONT "task/cpu tracing overlap..."); + + return error ? 0 : -1; +} + +int ds_selftest_bts(void) +{ + struct ds_selftest_bts_conf conf; + unsigned char buffer[BUFFER_SIZE], *small_buffer; + unsigned long irq; + int cpu; + + printk(KERN_INFO "[ds] bts selftest..."); + conf.error = 0; + + small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; + + get_online_cpus(); + for_each_online_cpu(cpu) { + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); + if (conf.error >= 0) { + conf.error = + ds_selftest_bts_bad_release_noirq(cpu, + conf.tracer); + /* We must not release the tracer twice. */ + if (conf.error < 0) + conf.tracer = NULL; + } + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); + smp_call_function_single(cpu, ds_release_bts_noirq_wrap, + conf.tracer, 1); + if (conf.error < 0) + goto out; + } + + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + local_irq_save(irq); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts_noirq(conf.tracer); + local_irq_restore(irq); + if (conf.error < 0) + goto out; + + conf.error = 0; + out: + put_online_cpus(); + printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); + + return conf.error; +} + +int ds_selftest_pebs(void) +{ + return 0; +} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h new file mode 100644 index 000000000000..2ba8745c6663 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.h @@ -0,0 +1,15 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2009 + */ + +#ifdef CONFIG_X86_DS_SELFTEST +extern int ds_selftest_bts(void); +extern int ds_selftest_pebs(void); +#else +static inline int ds_selftest_bts(void) { return 0; } +static inline int ds_selftest_pebs(void) { return 0; } +#endif diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b8698..81086c227ab7 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; -extern int kstack_depth_to_print; /* The form of the top of the frame on the stack */ struct stack_frame { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 006281302925..7271fa33d791 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, */ __init void e820_setup_gap(void) { - unsigned long gapstart, gapsize, round; + unsigned long gapstart, gapsize; int found; gapstart = 0x10000000; @@ -635,14 +635,9 @@ __init void e820_setup_gap(void) #endif /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. + * e820_reserve_resources_late protect stolen RAM already */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; + pci_mem_start = gapstart; printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", @@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void) } } +/* How much should we pad RAM ending depending on where it is? */ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 32MB for anything above that */ + return 32*1024*1024; +} + void __init e820_reserve_resources_late(void) { int i; @@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void) insert_resource_expand_to_fit(&iomem_resource, res); res++; } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + resource_size_t start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)); + if (start == end) + continue; + reserve_region_with_split(&iomem_resource, start, + end - 1, "RAM buffer"); + } } char *__init default_machine_specific_memory_setup(void) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953dee..ebdb85cf2686 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func) } #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; @@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } +#endif static void __init ati_bugs(int num, int slot, int func) { diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e8433..a4742a340d8d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -147,27 +147,14 @@ END(ftrace_graph_caller) GLOBAL(return_to_handler) subq $80, %rsp + /* Save the return values */ movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - movq %r10, 56(%rsp) - movq %r11, 64(%rsp) + movq %rdx, 8(%rsp) call ftrace_return_to_handler movq %rax, 72(%rsp) - movq 64(%rsp), %r11 - movq 56(%rsp), %r10 - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx + movq 8(%rsp), %rdx movq (%rsp), %rax addq $72, %rsp retq @@ -1025,6 +1012,11 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt +#endif + /* * Exception entry points. */ @@ -1379,6 +1371,11 @@ END(xen_failsafe_callback) paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment +#ifdef CONFIG_XEN +zeroentry xen_debug do_debug +zeroentry xen_int3 do_int3 +errorentry xen_stack_segment do_stack_segment +#endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30683883e0cd..dc5ed4bdd88d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -608,13 +608,6 @@ ignore_int: ENTRY(initial_code) .long i386_start_kernel -.section .text -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - /* * BSS section */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c8..38287b5f116e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include <asm/io_apic.h> #include <asm/irq.h> #include <asm/idle.h> +#include <asm/hw_irq.h> atomic_t irq_err_count; @@ -24,9 +25,9 @@ void (*generic_interrupt_extension)(void) = NULL; */ void ack_bad_irq(unsigned int irq) { - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); -#ifdef CONFIG_X86_LOCAL_APIC /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N @@ -36,9 +37,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -63,6 +62,14 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "CNT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, "%*s: ", prec, "PND"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); #endif if (generic_interrupt_extension) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -166,6 +173,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; + sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; #endif if (generic_interrupt_extension) sum += irq_stats(cpu)->generic_irqs; @@ -178,7 +187,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_thermal_count; # ifdef CONFIG_X86_64 sum += irq_stats(cpu)->irq_threshold_count; -#endif +# endif #endif return sum; } @@ -213,14 +222,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) irq = __get_cpu_var(vector_irq)[vector]; if (!handle_irq(irq, regs)) { -#ifdef CONFIG_X86_64 - if (!disable_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); } irq_exit(); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c index 368b0a8836f9..267c6624c77f 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit.c @@ -1,20 +1,25 @@ +#include <linux/linkage.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/timex.h> #include <linux/slab.h> #include <linux/random.h> +#include <linux/kprobes.h> #include <linux/init.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/bitops.h> +#include <linux/acpi.h> #include <linux/io.h> #include <linux/delay.h> #include <asm/atomic.h> #include <asm/system.h> #include <asm/timer.h> +#include <asm/hw_irq.h> #include <asm/pgtable.h> #include <asm/desc.h> #include <asm/apic.h> @@ -22,7 +27,23 @@ #include <asm/i8259.h> #include <asm/traps.h> +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ +#ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 * as the irq is unreliable, and exception 16 works correctly @@ -52,30 +73,7 @@ static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", }; - -void __init init_ISA_irqs(void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); #endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} /* * IRQ2 is cascade interrupt to second interrupt controller @@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init init_ISA_irqs(void) { int i; - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + init_bsp_APIC(); +#endif + init_8259A(0); /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) + * 16 old-style INTA-cycle interrupts: */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + for (i = 0; i < NR_IRQS_LEGACY; i++) { + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); } +} +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) +static void __init smp_intr_init(void) +{ +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -160,16 +166,32 @@ void __init native_init_IRQ(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - /* IPI for single call function */ + /* IPI for generic single function call */ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +#endif /* CONFIG_SMP */ +} + +static void __init apic_intr_init(void) +{ + smp_intr_init(); + +#ifdef CONFIG_X86_THERMAL_VECTOR + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#ifdef CONFIG_X86_THRESHOLD + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif +#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) + alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); +#endif -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -179,16 +201,59 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupts: */ +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); +# endif + #endif +} -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) - /* thermal monitor LVT interrupt */ - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ +#ifdef CONFIG_X86_32 + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } #endif + init_ISA_irqs(); +} + +void __init native_init_IRQ(void) +{ + int i; + + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ + if (!test_bit(i, used_vectors)) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + } if (!acpi_ioapic) setup_irq(2, &irq2); +#ifdef CONFIG_X86_32 /* * Call quirks after call gates are initialised (usually add in * the architecture specific gates): @@ -203,4 +268,5 @@ void __init native_init_IRQ(void) setup_irq(FPU_IRQ, &fpu_irq); irq_ctx_init(smp_processor_id()); +#endif } diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c deleted file mode 100644 index 8cd10537fd46..000000000000 --- a/arch/x86/kernel/irqinit_64.c +++ /dev/null @@ -1,177 +0,0 @@ -#include <linux/linkage.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/hw_irq.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/apic.h> -#include <asm/i8259.h> - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) - return 1; - } - - return 0; -} - -static void __init init_ISA_irqs(void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -} - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - - apic_intr_init(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b1f4dffb919e..8d82a77a3f3b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = p->thread.ip; + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33019ddb56b4..a78ecad0c900 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hardirq.h> +#include <asm/timer.h> #define MMU_QUEUE_SIZE 1024 @@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void) struct kvm_para_state *state = kvm_para_state(); mmu_queue_flush(state); - paravirt_leave_lazy(paravirt_get_lazy_mode()); + paravirt_leave_lazy_mmu(); state->mode = paravirt_get_lazy_mode(); } @@ -230,6 +231,9 @@ static void paravirt_ops_setup(void) pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; } +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif } void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b5795a5c6..366baa179913 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,25 +13,13 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ -#include <linux/platform_device.h> -#include <linux/capability.h> -#include <linux/miscdevice.h> #include <linux/firmware.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> #include <linux/pci_ids.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/cpu.h> #include <linux/pci.h> -#include <linux/fs.h> -#include <linux/mm.h> #include <asm/microcode.h> #include <asm/processor.h> @@ -79,9 +67,6 @@ struct microcode_amd { #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 -/* serialize access to the physical write */ -static DEFINE_SPINLOCK(microcode_update_lock); - static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) @@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 1; } -static void apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { - unsigned long flags; u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu) BUG_ON(cpu_num != cpu); if (mc_amd == NULL) - return; + return 0; - spin_lock_irqsave(µcode_update_lock, flags); wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); - return; + return -1; } printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; + + return 0; } static int get_ucode_data(void *to, const u8 *from, size_t n) @@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf) static void free_equiv_cpu_table(void) { - if (equiv_cpu_table) { - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; - } + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; } -static int generic_load_microcode(int cpu, const u8 *data, size_t size) +static enum ucode_state +generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; @@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; + enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: failed to create " "equivalent cpu table\n"); - return -EINVAL; + return UCODE_ERROR; } ucode_ptr += offset; @@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) mc_header = (struct microcode_header_amd *)mc; if (get_matching_microcode(cpu, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; } else @@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) if (new_mc) { if (!leftover) { - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = new_mc; pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - } else + } else { vfree(new_mc); - } + state = UCODE_ERROR; + } + } else + state = UCODE_NFOUND; free_equiv_cpu_table(); - return (int)leftover; + return state; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; const struct firmware *firmware; - int ret; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); + enum ucode_state ret; - ret = request_firmware(&firmware, fw_name, device); - if (ret) { + if (request_firmware(&firmware, fw_name, device)) { printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); @@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { printk(KERN_INFO "microcode: AMD microcode update via " "/dev/cpu/microcode not supported\n"); - return -1; + return UCODE_ERROR; } static void microcode_fini_cpu_amd(int cpu) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c069d1..9c4461501fcb 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -71,27 +71,18 @@ * Thanks to Stuart Swales for pointing out this bug. */ #include <linux/platform_device.h> -#include <linux/capability.h> #include <linux/miscdevice.h> -#include <linux/firmware.h> +#include <linux/capability.h> #include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> #include <linux/cpu.h> #include <linux/fs.h> #include <linux/mm.h> #include <asm/microcode.h> #include <asm/processor.h> -#include <asm/msr.h> MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -101,36 +92,110 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + #ifdef CONFIG_MICROCODE_OLD_INTERFACE static int do_microcode_update(const void __user *buf, size_t size) { - cpumask_t old; int error = 0; int cpu; - old = current->cpus_allowed; - for_each_online_cpu(cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); - if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); } -out: - set_cpus_allowed_ptr(current, &old); + return error; } @@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2) static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - ssize_t ret; + ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; + pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + return ret; } get_online_cpus(); mutex_lock(µcode_mutex); - ret = do_microcode_update(buf, len); - if (!ret) + if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; mutex_unlock(µcode_mutex); @@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, } static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, }; static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, }; static int __init microcode_dev_init(void) @@ -182,9 +245,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static long reload_for_cpu(void *unused) +static int reload_for_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; mutex_lock(µcode_mutex); if (uci->valid) { - err = microcode_ops->request_microcode_fw(smp_processor_id(), - µcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); + enum ucode_state ustate; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; } mutex_unlock(µcode_mutex); + return err; } static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, - const char *buf, size_t sz) + const char *buf, size_t size) { - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; + unsigned long val; int cpu = dev->id; + int ret = 0; + char *end; + val = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (val == 1) { get_online_cpus(); if (cpu_online(cpu)) - err = work_on_cpu(cpu, reload_for_cpu, NULL); + ret = reload_for_cpu(cpu); put_online_cpus(); } - if (err) - return err; - return sz; + + if (!ret) + ret = size; + + return ret; } static ssize_t version_show(struct sys_device *dev, @@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = { }; static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", + .attrs = mc_default_attrs, + .name = "microcode", }; -static void __microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu) uci->valid = 0; } -static void microcode_fini_cpu(int cpu) -{ - mutex_lock(µcode_mutex); - __microcode_fini_cpu(cpu); - mutex_unlock(µcode_mutex); -} - -static void collect_cpu_info(int cpu) +static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - memset(uci, 0, sizeof(*uci)); - if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) - uci->valid = 1; + if (!uci->mc) + return UCODE_NFOUND; + + pr_debug("microcode: CPU%d updated upon resume\n", cpu); + apply_microcode_on_target(cpu); + + return UCODE_OK; } -static int microcode_resume_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpu_signature nsig; + enum ucode_state ustate; - pr_debug("microcode: CPU%d resumed\n", cpu); + if (collect_cpu_info(cpu)) + return UCODE_ERROR; - if (!uci->mc) - return 1; + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", - cpu); - return -1; - } + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); - if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", - cpu); - /* Should we look for a new ucode here? */ - return 1; + if (ustate == UCODE_OK) { + pr_debug("microcode: CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); } - return 0; + return ustate; } -static long microcode_update_cpu(void *unused) +static enum ucode_state microcode_update_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); - int err = 0; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; - /* - * Check if the system resume is in progress (uci->valid != NULL), - * otherwise just request a firmware: - */ - if (uci->valid) { - err = microcode_resume_cpu(smp_processor_id()); - } else { - collect_cpu_info(smp_processor_id()); - if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw( - smp_processor_id(), - µcode_pdev->dev); - } - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); - return err; -} + if (uci->valid) + ustate = microcode_resume_cpu(cpu); + else + ustate = microcode_init_cpu(cpu); -static int microcode_init_cpu(int cpu) -{ - int err; - mutex_lock(µcode_mutex); - err = work_on_cpu(cpu, microcode_update_cpu, NULL); - mutex_unlock(µcode_mutex); - - return err; + return ustate; } static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) return err; - err = microcode_init_cpu(cpu); + if (microcode_init_cpu(cpu) == UCODE_ERROR) + err = -EINVAL; return err; } @@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) static int mc_sysdev_resume(struct sys_device *dev) { int cpu = dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; - /* only CPU 0 will apply ucode here */ - microcode_update_cpu(NULL); + /* + * All non-bootup cpus are still disabled, + * so only CPU 0 will apply ucode here. + * + * Moreover, there can be no concurrent + * updates from any other places at this point. + */ + WARN_ON(cpu != 0); + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); + return 0; } static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, }; static __cpuinit int @@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (microcode_init_cpu(cpu)) - printk(KERN_ERR "microcode: failed to init CPU%d\n", - cpu); + microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); + pr_err("microcode: Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -465,13 +508,10 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + pr_err("microcode: no support for this CPU vendor\n"); return -ENODEV; } - error = microcode_dev_init(); - if (error) - return error; microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) { @@ -480,23 +520,31 @@ static int __init microcode_init(void) } get_online_cpus(); + mutex_lock(µcode_mutex); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); + if (error) { - microcode_dev_exit(); platform_device_unregister(microcode_pdev); return error; } + error = microcode_dev_init(); + if (error) + return error; + register_hotcpu_notifier(&mc_cpu_notifier); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>," " Peter Oruba\n"); return 0; } +module_init(microcode_init); static void __exit microcode_exit(void) { @@ -505,16 +553,17 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); get_online_cpus(); + mutex_lock(µcode_mutex); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); platform_device_unregister(microcode_pdev); microcode_ops = NULL; - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } - -module_init(microcode_init); module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec7c1ab..0d334ddd0a96 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,24 +70,11 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ -#include <linux/platform_device.h> -#include <linux/capability.h> -#include <linux/miscdevice.h> #include <linux/firmware.h> -#include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> #include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/cpu.h> -#include <linux/fs.h> -#include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/microcode.h> #include <asm/processor.h> @@ -150,13 +137,9 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - spin_unlock_irqrestore(µcode_update_lock, flags); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - csig->sig, csig->pf, csig->rev); + printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) return 0; } -static void apply_microcode(int cpu) +static int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; - unsigned long flags; unsigned int val[2]; int cpu_num; @@ -334,10 +312,7 @@ static void apply_microcode(int cpu) BUG_ON(cpu_num != cpu); if (mc_intel == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); + return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, @@ -351,30 +326,32 @@ static void apply_microcode(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", - cpu_num, uci->cpu_sig.rev, val[1]); - return; + printk(KERN_ERR "microcode: CPU%d update " + "to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->cpu_sig.rev, val[1], + printk(KERN_INFO "microcode: CPU%d updated to revision " + "0x%x, date = %04x-%02x-%02x \n", + cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + + return 0; } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u8 *ucode_ptr = data, *new_mc = NULL, *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; + enum ucode_state state = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (!new_mc) + if (leftover) { + if (new_mc) + vfree(new_mc); + state = UCODE_ERROR; goto out; + } - if (leftover) { - vfree(new_mc); + if (!new_mc) { + state = UCODE_NFOUND; goto out; } @@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - - out: - return (int)leftover; +out: + return state; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - ret = request_firmware(&firmware, name, device); - if (ret) { + + if (request_firmware(&firmware, name, device)) { pr_debug("microcode: data file %s load failed\n", name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, (void *)firmware->data, @@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) return copy_from_user(to, from, n); } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70fd7e414c15..651c93b28862 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -17,6 +17,7 @@ #include <linux/acpi.h> #include <linux/module.h> #include <linux/smp.h> +#include <linux/pci.h> #include <asm/mtrr.h> #include <asm/mpspec.h> @@ -870,24 +871,17 @@ static inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} #endif /* CONFIG_X86_IO_APIC */ -static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, - int count) +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - if (!mpc_new_phys) { - pr_info("No spare slots, try to append...take your risk, " - "new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - pr_info("No spare slots, try to append..., " - "new mpc_length %x\n", count); - else { - pr_err("mpc_new_length %lx is too small\n", - mpc_new_length); - return -1; - } + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; } - return 0; + return ret; } static int __init replace_intsrc_all(struct mpc_table *mpc, @@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!check_slot(mpc_new_phys, mpc_new_length, count)) + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; @@ -963,11 +957,14 @@ out: return 0; } -static int __initdata enable_update_mptable; +int enable_update_mptable; static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif return 0; } early_param("update_mptable", update_mptable_setup); @@ -980,6 +977,9 @@ static int __initdata alloc_mptable; static int __init parse_alloc_mptable_opt(char *p) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif alloc_mptable = 1; if (!p) return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9faf43bea336..70ec9b951d76 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - BUG_ON(preemptible()); + BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - __get_cpu_var(paravirt_lazy_mode) = mode; + percpu_write(paravirt_lazy_mode, mode); } -void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); - BUG_ON(preemptible()); + BUG_ON(percpu_read(paravirt_lazy_mode) != mode); - __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void) void paravirt_leave_lazy_mmu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_MMU); + leave_lazy(PARAVIRT_LAZY_MMU); } -void paravirt_enter_lazy_cpu(void) +void paravirt_start_context_switch(struct task_struct *prev) { + BUG_ON(preemptible()); + + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } enter_lazy(PARAVIRT_LAZY_CPU); } -void paravirt_leave_lazy_cpu(void) +void paravirt_end_context_switch(struct task_struct *next) { - paravirt_leave_lazy(PARAVIRT_LAZY_CPU); + BUG_ON(preemptible()); + + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { - return __get_cpu_var(paravirt_lazy_mode); + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + + return percpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) @@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } @@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_enable(); } -void arch_flush_lazy_cpu_mode(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { - WARN_ON(preempt_count() == 1); - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } - - preempt_enable(); -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = { .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, - .lazy_mode = { - .enter = paravirt_nop, - .leave = paravirt_nop, - }, + .start_context_switch = paravirt_nop, + .end_context_switch = paravirt_nop, }; struct pv_apic_ops pv_apic_ops = { diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e906f3..971a3bec47a8 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = { static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -static int debugging = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - unsigned long idx = start; - - BUG_ON(start >= end); - - while (idx < end) { - if (!!test_bit(idx, bitmap) != expected) - return idx; - ++idx; - } - - /* all bits have the expected value */ - return ~0UL; -} -#else /* debugging is disabled */ -static int debugging; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; - unsigned long badbit; unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 0, index, end); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: entry already allocated at " - "0x%lx tbl %p dma 0x%lx npages %u\n", - badbit, tbl, start_addr, npages); - } - iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long badbit; unsigned long badend; unsigned long flags; @@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: bit is off at 0x%lx " - "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - badbit, tbl, dma_addr, entry, npages); - } - iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -1488,9 +1439,8 @@ void __init detect_calgary(void) iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " - "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, - debugging ? "enabled" : "disabled"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58c035c..cfd9f9063896 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -144,48 +144,21 @@ static void flush_gart(void) } #ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0);\ - } while (0) - -#define CLEAR_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; \ - } while (0) - /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; static void dump_leak(void) { - int i; static int dump; - if (dump || !iommu_leak_tab) + if (dump) return; dump = 1; - show_stack(NULL, NULL); - /* Very crude. dump some from the end of the table too */ - printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", - iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i += 2) { - printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], - 0); - printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk(KERN_DEBUG "\n"); + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); } -#else -# define SET_LEAK(x) -# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) @@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); phys_mem += PAGE_SIZE; } return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); @@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); } free_iommu(iommu_page, npages); } @@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } @@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - enable_gart_translations(); - error = sysdev_class_register(&gart_sysdev_class); if (!error) error = sysdev_register(&device_gart); @@ -801,11 +769,12 @@ void __init gart_iommu_init(void) #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - get_order(iommu_pages*sizeof(void *))); - if (!iommu_leak_tab) + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) printk(KERN_DEBUG - "PCI-DMA: Cannot allocate leak trace area\n"); + "PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -845,6 +814,14 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); /* * Try to workaround a bug (thanks to BenH): diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 221a3853e268..a1712f2b50f1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e847..3bb2be1649bd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,12 +8,15 @@ #include <linux/module.h> #include <linux/pm.h> #include <linux/clockchips.h> +#include <linux/random.h> #include <trace/power.h> #include <asm/system.h> #include <asm/apic.h> +#include <asm/syscalls.h> #include <asm/idle.h> #include <asm/uaccess.h> #include <asm/i387.h> +#include <asm/ds.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + + WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } void free_thread_info(struct thread_info *ti) @@ -83,8 +88,6 @@ void exit_thread(void) put_cpu(); kfree(bp); } - - ds_exit_thread(current); } void flush_thread(void) @@ -613,3 +616,16 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a2..59f4524984af 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,8 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include <stdarg.h> - #include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> @@ -33,7 +31,6 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/ptrace.h> -#include <linux/random.h> #include <linux/personality.h> #include <linux/tick.h> #include <linux/percpu.h> @@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_max = 0; } - ds_copy_thread(p, current); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; @@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(next_p); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the @@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b1..ebefb5407b9d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,8 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include <stdarg.h> - #include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> @@ -32,7 +30,6 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/ptrace.h> -#include <linux/random.h> #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/kdebug.h> @@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, goto out; } - ds_copy_thread(p, me); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; @@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(next_p); /* * Switch FS and GS. @@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 23b7c8f017e2..09ecbde91c13 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include <linux/audit.h> #include <linux/seccomp.h> #include <linux/signal.h> +#include <linux/workqueue.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS +/* + * A branch trace store context. + * + * Contexts may only be installed by ptrace_bts_config() and only for + * ptraced tasks. + * + * Contexts are destroyed when the tracee is detached from the tracer. + * The actual destruction work requires interrupts enabled, so the + * work is deferred and will be scheduled during __ptrace_unlink(). + * + * Contexts hold an additional task_struct reference on the traced + * task, as well as a reference on the tracer's mm. + * + * Ptrace already holds a task_struct for the duration of ptrace operations, + * but since destruction is deferred, it may be executed after both + * tracer and tracee exited. + */ +struct bts_context { + /* The branch trace handle. */ + struct bts_tracer *tracer; + + /* The buffer used to store the branch trace and its size. */ + void *buffer; + unsigned int size; + + /* The mm that paid for the above buffer. */ + struct mm_struct *mm; + + /* The task this context belongs to. */ + struct task_struct *task; + + /* The signal to send on a bts buffer overflow. */ + unsigned int bts_ovfl_signal; + + /* The work struct to destroy a context. */ + struct work_struct work; +}; + +static int alloc_bts_buffer(struct bts_context *context, unsigned int size) +{ + void *buffer = NULL; + int err = -ENOMEM; + + err = account_locked_memory(current->mm, current->signal->rlim, size); + if (err < 0) + return err; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out_refund; + + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + + return 0; + + out_refund: + refund_locked_memory(current->mm, size); + return err; +} + +static inline void free_bts_buffer(struct bts_context *context) +{ + if (!context->buffer) + return; + + kfree(context->buffer); + context->buffer = NULL; + + refund_locked_memory(context->mm, context->size); + context->size = 0; + + mmput(context->mm); + context->mm = NULL; +} + +static void free_bts_context_work(struct work_struct *w) +{ + struct bts_context *context; + + context = container_of(w, struct bts_context, work); + + ds_release_bts(context->tracer); + put_task_struct(context->task); + free_bts_buffer(context); + kfree(context); +} + +static inline void free_bts_context(struct bts_context *context) +{ + INIT_WORK(&context->work, free_bts_context_work); + schedule_work(&context->work); +} + +static inline struct bts_context *alloc_bts_context(struct task_struct *task) +{ + struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); + if (context) { + context->task = task; + task->bts = context; + + get_task_struct(task); + } + + return context; +} + static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; struct bts_struct bts; const unsigned char *at; int error; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; at = trace->ds.top - ((index + 1) * trace->ds.size); if ((void *)at < trace->ds.begin) @@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (!trace->read) return -EOPNOTSUPP; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; const unsigned char *at; int error, drained = 0; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; if (!trace->read) return -EOPNOTSUPP; @@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child, for (at = trace->ds.begin; (void *)at < trace->ds.top; out++, drained++, at += trace->ds.size) { struct bts_struct bts; - int error; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child, memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - error = ds_reset_bts(child->bts); + error = ds_reset_bts(context->tracer); if (error < 0) return error; return drained; } -static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) -{ - child->bts_buffer = alloc_locked_buffer(size); - if (!child->bts_buffer) - return -ENOMEM; - - child->bts_size = size; - - return 0; -} - -static void ptrace_bts_free_buffer(struct task_struct *child) -{ - free_locked_buffer(child->bts_buffer, child->bts_size); - child->bts_buffer = NULL; - child->bts_size = 0; -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; struct ptrace_bts_config cfg; unsigned int flags = 0; @@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child, if (copy_from_user(&cfg, ucfg, sizeof(cfg))) return -EFAULT; - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - } + context = child->bts; + if (!context) + context = alloc_bts_context(child); + if (!context) + return -ENOMEM; if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) return -EINVAL; - child->thread.bts_ovfl_signal = cfg.signal; return -EOPNOTSUPP; + context->bts_ovfl_signal = cfg.signal; } - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && - (cfg.size != child->bts_size)) { - int error; + ds_release_bts(context->tracer); + context->tracer = NULL; - ptrace_bts_free_buffer(child); + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + int err; - error = ptrace_bts_allocate_buffer(child, cfg.size); - if (error < 0) - return error; + free_bts_buffer(context); + if (!cfg.size) + return 0; + + err = alloc_bts_buffer(context, cfg.size); + if (err < 0) + return err; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - flags); - if (IS_ERR(child->bts)) { - int error = PTR_ERR(child->bts); - - ptrace_bts_free_buffer(child); - child->bts = NULL; + context->tracer = + ds_request_bts_task(child, context->buffer, context->size, + NULL, (size_t)-1, flags); + if (unlikely(IS_ERR(context->tracer))) { + int error = PTR_ERR(context->tracer); + free_bts_buffer(context); + context->tracer = NULL; return error; } @@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; const struct bts_trace *trace; struct ptrace_bts_config cfg; + context = child->bts; + if (!context) + return -ESRCH; + if (cfg_size < sizeof(cfg)) return -EIO; - trace = ds_read_bts(child->bts); + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = child->thread.bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); + cfg.size = trace->ds.end - trace->ds.begin; + cfg.signal = context->bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; @@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child, static int ptrace_bts_clear(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - return ds_reset_bts(child->bts); + return ds_reset_bts(context->tracer); } static int ptrace_bts_size(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static void ptrace_bts_fork(struct task_struct *tsk) -{ - tsk->bts = NULL; - tsk->bts_buffer = NULL; - tsk->bts_size = 0; - tsk->thread.bts_ovfl_signal = 0; -} - -static void ptrace_bts_untrace(struct task_struct *child) +/* + * Called from __ptrace_unlink() after the child has been moved back + * to its original parent. + */ +void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { - ds_release_bts(child->bts); + free_bts_context(child->bts); child->bts = NULL; - - /* We cannot update total_vm and locked_vm since - child's mm is already gone. But we can reclaim the - memory. */ - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; } } - -static void ptrace_bts_detach(struct task_struct *child) -{ - /* - * Ptrace_detach() races with ptrace_untrace() in case - * the child dies and is reaped by another thread. - * - * We only do the memory accounting at this point and - * leave the buffer deallocation and the bts tracer - * release to ptrace_bts_untrace() which will be called - * later on with tasklist_lock held. - */ - release_locked_buffer(child->bts_buffer, child->bts_size); -} -#else -static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_detach(struct task_struct *child) {} -static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ -void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - ptrace_bts_fork(child); -} - -void x86_ptrace_untrace(struct task_struct *child) -{ - ptrace_bts_untrace(child); -} - /* * Called by kernel/ptrace.c when detaching.. * @@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 7563b31b4f03..af71d06624bf 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,5 +491,42 @@ void force_hpet_resume(void) break; } } +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + set_dev_node(&dev->dev, val & 7); + pci_dev_put(dev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 667188e0b5a0..d2d1ce8170f0 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf63..d1c636bf31a7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,14 @@ #define ARCH_SETUP #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + RESERVE_BRK(dmi_alloc, 65536); unsigned int boot_cpu_id __read_mostly; @@ -214,8 +222,8 @@ unsigned long mmu_cr4_features; unsigned long mmu_cr4_features = X86_CR4_PAE; #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; /* * Setup options @@ -706,6 +714,12 @@ void __init setup_arch(char **cmdline_p) #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; @@ -854,12 +868,16 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); #endif + printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); + reserve_brk(); /* max_pfn_mapped is updated here */ @@ -997,24 +1015,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 /** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - -/** * x86_quirk_intr_init - post gate setup interrupt initialisation * * Description: diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8f0e13be36b3..9c3f0823e6aa 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) + /* + * make sure boot cpu node_number is right, when boot cpu is on the + * node that doesn't have mem installed + */ + per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); +#endif + /* Setup node to cpumask map */ setup_node_to_cpumask_map(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e3..0a813b17b172 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ - #include <linux/sched.h> #include <linux/mm.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8ccaa..28f5fb495a66 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ } void smp_call_function_interrupt(struct pt_regs *regs) @@ -193,19 +196,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) } struct smp_ops smp_ops = { - .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, - .smp_prepare_cpus = native_smp_prepare_cpus, - .smp_cpus_done = native_smp_cpus_done, + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, - .smp_send_reschedule = native_smp_send_reschedule, + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, - .cpu_up = native_cpu_up, - .cpu_die = native_cpu_die, - .cpu_disable = native_cpu_disable, - .play_dead = native_play_dead, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, - .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d8..7c80007ea5f7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -int __devinit +int __cpuinit wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -int __devinit +static int __cpuinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -822,10 +822,12 @@ do_rest: /* mark "stuck" area as not stuck */ *((volatile unsigned long *)trampoline_base) = 0; - /* - * Cleanup possible dangling ends... - */ - smpboot_restore_warm_reset_vector(); + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + } return boot_error; } @@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - printk(KERN_ERR "... forcing use of dummy APIC emulation." + if (!disable_apic) { + pr_err("BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + pr_err("... forcing use of dummy APIC emulation." "(tell your hw vendor)\n"); + } smpboot_clear_io_apic(); arch_disable_smp_support(); return -1; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index f7bddc2e37d1..4aaf7e48394f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) static int save_stack_stack(void *data, char *name) { - return -1; + return 0; } static void save_stack_address(void *data, unsigned long addr, int reliable) diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c8736b491..d51321ddafda 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -334,3 +334,5 @@ ENTRY(sys_call_table) .long sys_inotify_init1 .long sys_preadv .long sys_pwritev + .long sys_rt_tgsigqueueinfo /* 335 */ + .long sys_perf_counter_open diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 8c7b03b0cfcb..124d40c575df 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode) struct bau_desc *adp; struct bau_desc *ad2; - adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); + /* + * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade + */ + adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* + UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ @@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode) (n << UV_DESC_BASE_PNODE_SHIFT | m)); } - for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { + /* + * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * cpu even though we only use the first one; one descriptor can + * describe a broadcast to 256 nodes. + */ + for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); + i++, ad2++) { memset(ad2, 0, sizeof(struct bau_desc)); ad2->header.sw_ack_flag = 1; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..07d60c870ce2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } -#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } @@ -945,8 +942,13 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 @@ -963,17 +965,9 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); -#endif - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_X86_64 - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else set_bit(SYSCALL_VECTOR, used_vectors); #endif + /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d57de05dc430..3e1c057e98fe 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, tsc_khz; + unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; - tsc_khz = get_hypervisor_tsc_freq(); - if (tsc_khz) { + hv_tsc_khz = get_hypervisor_tsc_freq(); + if (hv_tsc_khz) { printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return tsc_khz; + return hv_tsc_khz; } local_irq_save(flags); @@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs) #ifdef CONFIG_X86_64 static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)vget_cycles(); + cycle_t ret; + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + rdtsc_barrier(); return ret >= __vsyscall_gtod_data.clock.cycle_last ? ret : __vsyscall_gtod_data.clock.cycle_last; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328f6ef9..027b5b498993 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count; * of a critical section, to be able to prove TSC time-warps: */ static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; + static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk(KERN_INFO - "Skipping synchronization checks as TSC is reliable.\n"); + pr_info("Skipping synchronization checks as TSC is reliable.\n"); return; } - printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); + pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); /* * Reset it - in case this is a second bootup: @@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu) if (nr_warps) { printk("\n"); - printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," - " turning off TSC clock.\n", max_warp); + pr_warning("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { printk(" passed.\n"); @@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&stop_count) != cpus) cpu_relax(); } -#undef NR_LOOPS - diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1c..9c4e62539058 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs.pt.ds = 0; info->regs.pt.es = 0; info->regs.pt.fs = 0; - -/* we are clearing gs later just before "jmp resume_userspace", - * because it is not saved/restored. - */ +#ifndef CONFIG_X86_32_LAZY_GS + info->regs.pt.gs = 0; +#endif /* * The flags register is also special: we cannot trust that the user @@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%ax) to 0 + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ - info->regs32->ax = 0; + info->regs32->ax = VM86_SIGNAL; tsk->thread.saved_sp0 = tsk->thread.sp0; tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); @@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" +#ifdef CONFIG_X86_32_LAZY_GS "mov %2, %%gs\n\t" +#endif "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95deb9f2211e..b263423fbe2a 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_enter_lazy_cpu(void) +static void vmi_start_context_switch(struct task_struct *prev) { - paravirt_enter_lazy_cpu(); + paravirt_start_context_switch(prev); vmi_ops.set_lazy_mode(2); } +static void vmi_end_context_switch(struct task_struct *next) +{ + vmi_ops.set_lazy_mode(0); + paravirt_end_context_switch(next); +} + static void vmi_enter_lazy_mmu(void) { paravirt_enter_lazy_mmu(); vmi_ops.set_lazy_mode(1); } -static void vmi_leave_lazy(void) +static void vmi_leave_lazy_mmu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_mmu(); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -711,14 +717,14 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); - para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee611f013..4c85b2e2bb65 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,5 +1,431 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org> + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + #ifdef CONFIG_X86_32 -# include "vmlinux_32.lds.S" +#define LOAD_OFFSET __PAGE_OFFSET #else -# include "vmlinux_64.lds.S" +#define LOAD_OFFSET __START_KERNEL_map #endif + +#include <asm-generic/vmlinux.lds.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/page_types.h> +#include <asm/cache.h> +#include <asm/boot.h> + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_X86_64 + user PT_LOAD FLAGS(7); /* RWE */ + data.init PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(7); /* RWE */ +#endif + data.init2 PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} + +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; +#else + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + + /* Text and read-only data */ + + /* bootstrapping code */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* The rest of the text */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_X86_32 + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) +#endif + . = ALIGN(8); + _stext = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RODATA + + /* Data */ + . = ALIGN(PAGE_SIZE); + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + +#ifdef CONFIG_X86_64 + /* End of data section */ + _edata = .; +#endif + } :data + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } +#endif + + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } + +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +#endif + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); +#endif + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + +#ifdef CONFIG_X86_32 + /* End of data section */ + _edata = .; +#endif + } + +#ifdef CONFIG_X86_64 + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); + + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } + + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); + + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } + + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif /* CONFIG_X86_64 */ + + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } +#ifdef CONFIG_X86_64 + :data.init +#endif + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + . = ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#ifdef CONFIG_BLK_DEV_INITRD + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) +#else + PERCPU(PAGE_SIZE) +#endif + + . = ALIGN(PAGE_SIZE); + + /* freed after init ends here */ + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ +#endif + + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } + + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = .; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG +} + + +#ifdef CONFIG_X86_32 +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + +#ifdef CONFIG_SMP +ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif + diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index 62ad500d55f3..000000000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,229 +0,0 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - * - * Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ - -#define LOAD_OFFSET __PAGE_OFFSET - -#include <asm-generic/vmlinux.lds.h> -#include <asm/thread_info.h> -#include <asm/page_types.h> -#include <asm/cache.h> -#include <asm/boot.h> - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) -jiffies = jiffies_64; - -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} -SECTIONS -{ - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - /* writeable */ - . = ALIGN(PAGE_SIZE); - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ - DATA_DATA - CONSTRUCTORS - } :data - - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - _edata = .; /* End of data section */ - } - - . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* will be freed after init */ - . = ALIGN(PAGE_SIZE); /* Init code and data */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } - - STABS_DEBUG - - DWARF_DEBUG -} - -/* - * Build-time check on the image size: - */ -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_KEXEC -/* Link time checks */ -#include <asm/kexec.h> - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index c8742507b030..000000000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,298 +0,0 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include <asm-generic/vmlinux.lds.h> -#include <asm/asm-offsets.h> -#include <asm/page_types.h> - -#undef i386 /* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ -#endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - _edata = .; /* End of data section */ - } :data - - - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) - { *(.vsyscall_gtod_data) } - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) - { *(.vsyscall_clock) } - vsyscall_clock = VVIRT(.vsyscall_clock); - - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } - - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); - - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) - { *(.vsyscall_3) } - - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); /* init_task */ - *(.data.init_task) - }:data.init - - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } - - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - -#ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else - PERCPU(PAGE_SIZE) -#endif - - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - - STABS_DEBUG - - DWARF_DEBUG -} - - /* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); - -/* - * Build-time check on the image size: - */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); -#endif - -#ifdef CONFIG_KEXEC -#include <asm/kexec.h> - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 44153afc9067..25ee06a80aad 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) return; } - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); now = vread(); - rdtsc_barrier(); - base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; |