From 1d3381ebf42de1b6f8c118732893cb5bdc37edcd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Mar 2008 16:59:12 -0700 Subject: x86: convert mtrr/generic.c to kernel-doc Convert function comment blocks to kernel-doc notation. Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/generic.c | 42 +++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 103d61a59b19..3e18db4cefee 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -176,12 +176,13 @@ static inline void k8_enable_fixed_iorrs(void) } /** - * Checks and updates an fixed-range MTRR if it differs from the value it - * should have. If K8 extentions are wanted, update the K8 SYSCFG MSR also. - * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information - * \param msr MSR address of the MTTR which should be checked and updated - * \param changed pointer which indicates whether the MTRR needed to be changed - * \param msrwords pointer to the MSR values which the MSR should have + * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have + * @msr: MSR address of the MTTR which should be checked and updated + * @changed: pointer which indicates whether the MTRR needed to be changed + * @msrwords: pointer to the MSR values which the MSR should have + * + * If K8 extentions are wanted, update the K8 SYSCFG MSR also. + * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information. */ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { @@ -199,12 +200,15 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) } } +/** + * generic_get_free_region - Get a free MTRR. + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * @replace_reg: mtrr index to be replaced; set to invalid value if none. + * + * Returns: The index of the region on success, else negative on error. + */ int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free MTRR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { int i, max; mtrr_type ltype; @@ -249,8 +253,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, } /** - * Checks and updates the fixed-range MTRRs if they differ from the saved set - * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges() + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set + * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ static int set_fixed_ranges(mtrr_type * frs) { @@ -294,13 +298,13 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) static u32 deftype_lo, deftype_hi; +/** + * set_mtrr_state - Set the MTRR state for this CPU. + * + * NOTE: The CPU must already be in a safe state for MTRR changes. + * RETURNS: 0 if no changes made, else a mask indicating what was changed. + */ static unsigned long set_mtrr_state(void) -/* [SUMMARY] Set the MTRR state for this CPU. - The MTRR state information to read. - Some relevant CPU context. - [NOTE] The CPU must already be in a safe state for MTRR changes. 
- [RETURNS] 0 if no changes made, else a mask indication what was changed. -*/ { unsigned int i; unsigned long change_mask = 0; -- cgit v1.2.3 From 3c274c2909e17aa0afeded4cd4520b7357357ca0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 21 Mar 2008 10:06:32 +0100 Subject: x86: add dmi quirk for io_delay reported by mereandor@gmail.com, in: http://bugzilla.kernel.org/show_bug.cgi?id=6307 Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_delay.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index c706a3061553..5921e5f0a640 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -76,6 +76,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30B9") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B8") + } + }, { .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion tx1000", -- cgit v1.2.3 From 475613b9e374bf0c15340eb166a962da04aa02e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Feb 2008 23:23:09 -0800 Subject: x86: fix memoryless node oops during boot fix oops during boot reported in this thread: http://lkml.org/lkml/2008/2/6/65 enable booting on memoryless nodes. Reported-by: Kamalesh Babulal Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 7637dc91c79b..f4f7ecfb898c 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -801,7 +801,7 @@ static void __cpuinit srat_detect_node(void) /* Don't do the funky fallback heuristics the AMD version employs for now. */ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE || !node_online(node)) node = first_node(node_online_map); numa_set_node(cpu, node); -- cgit v1.2.3 From c6e8256a7b15033bc5d7797e25c7e053040c4c7c Mon Sep 17 00:00:00 2001 From: Stephan Diestelhorst Date: Mon, 10 Mar 2008 16:05:41 +0100 Subject: x86, cpufreq: fix Speedfreq-SMI call that clobbers ECX I have found that using SMI to change the cpu's frequency on my DELL Latitude L400 clobbers the ECX register in speedstep_set_state, causing unnecessary retries because the "state" variable has changed silently (GCC assumes it is still present in ECX). Play safe and avoid gcc caching any register across IO port accesses that trigger SMIs.
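[ note: a minimal standalone sketch of the problem class, not the driver code itself -- smi_call() and its arguments are hypothetical. Without the extra outputs, GCC may assume a register such as ECX still holds its pre-asm value after the SMI returns:

	static unsigned int smi_call(unsigned int cmd, unsigned int func,
				     unsigned int arg, unsigned int port)
	{
		unsigned int result, dummy;

		__asm__ __volatile__(
			"out %%al, (%%dx)\n"
			/* every register the SMI may touch is listed as
			 * an output, so GCC reloads instead of reusing it */
			: "=D" (result), "=a" (dummy), "=b" (dummy),
			  "=c" (dummy), "=d" (dummy), "=S" (dummy)
			: "a" (cmd), "b" (func), "c" (arg), "d" (port),
			  "D" (0), "S" (0)
			: "memory");
		return result;
	}
]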
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 39 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index f2b5a621d27b..8a85c93bd62a 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -63,7 +63,7 @@ static struct cpufreq_frequency_table speedstep_freqs[] = { */ static int speedstep_smi_ownership (void) { - u32 command, result, magic; + u32 command, result, magic, dummy; u32 function = GET_SPEEDSTEP_OWNER; unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; @@ -73,8 +73,11 @@ static int speedstep_smi_ownership (void) dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port); __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=D" (result) + "pop %%ebp\n" + : "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), + "=S" (dummy) : "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic) : "memory" @@ -96,7 +99,7 @@ static int speedstep_smi_ownership (void) */ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) { - u32 command, result = 0, edi, high_mhz, low_mhz; + u32 command, result = 0, edi, high_mhz, low_mhz, dummy; u32 state=0; u32 function = GET_SPEEDSTEP_FREQS; @@ -109,10 +112,12 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port); - __asm__ __volatile__("movl $0, %%edi\n" + __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi) - : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0) + "pop %%ebp" + : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy) + : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) ); dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz); @@ -135,16 +140,18 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) static int speedstep_get_state (void) { u32 function=GET_SPEEDSTEP_STATE; - u32 result, state, edi, command; + u32 result, state, edi, command, dummy; command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port); - __asm__ __volatile__("movl $0, %%edi\n" + __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=a" (result), "=b" (state), "=D" (edi) - : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0) + "pop %%ebp\n" + : "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy) + : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0) ); dprintk("state is %x, result is %x\n", state, result); @@ -160,7 +167,7 @@ static int speedstep_get_state (void) */ static void speedstep_set_state (unsigned int state) { - unsigned int result = 0, command, new_state; + unsigned int result = 0, command, new_state, dummy; unsigned long flags; unsigned int function=SET_SPEEDSTEP_STATE; unsigned int retry = 0; @@ -182,10 +189,12 @@ static void speedstep_set_state (unsigned int state) } retry++; __asm__ __volatile__( - "movl $0, %%edi\n" + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=b" (new_state), "=D" (result) - : "a" (command), "b"
(function), "c" (state), "d" (smi_port), "S" (0) + "pop %%ebp" + : "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy), + "=d" (dummy), "=S" (dummy) + : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) ); } while ((new_state != state) && (retry <= SMI_TRIES)); @@ -195,7 +204,7 @@ static void speedstep_set_state (unsigned int state) if (new_state == state) { dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result); } else { - printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result); + printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result); } return; -- cgit v1.2.3 From 923a0cf82f2b504e316642e2d152d38b6c0be4ba Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 26 Mar 2008 14:13:01 -0400 Subject: x86: GEODE: add missing module.h include On Wed, 26 Mar 2008 11:56:22 -0600 Jordan Crouse wrote: > On 26/03/08 14:31 +0100, Stefan Pfetzing wrote: > > Hello Jordan, > > > > I just tried to build your geodwdt driver for the geode watchdog. Therefore > > I pulled your repository from http://git.infradead.org/geode.git (or more, > > the git url). > > > > I tried to build the geodewdt driver as a module - which didn't work, and > > it failed with the same problem as earlier mentioned on lkmk [1]. I also > > checked the fix [2], but that seems to be already in your (or linus) tree - > > and so I'm unsure what the problem is. > > > > [1] http://kerneltrap.org/mailarchive/linux-kernel/2008/2/17/884074 > > [2] http://kerneltrap.org/mailarchive/linux-kernel/2008/2/17/884174 > > > > Building directly into the kernel seems to work. > > > > Maybe you have some idea? > > Hmm - that is strange. Exporting the symbols should work. I recommend > starting over with a clean tree. > > CCing Andres - any thoughts? > > Jordan > Er, yeah. The patch below should fix it. This should probably go into 2.6.25. Oops, EXPORT_SYMBOL_GPL wasn't being declared due to this header being missing. Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar --- arch/x86/kernel/mfgpt_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 027fc067b399..b402c0f3f192 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -30,6 +30,7 @@ #include #include +#include #include static struct mfgpt_timer_t { -- cgit v1.2.3 From 76c324182bbd29dfe4298ca65efb15be18055df1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 23 Mar 2008 00:16:49 -0700 Subject: x86: fix trim mtrr not to setup_memory two times we could call find_max_pfn() directly instead of setup_memory() to get max_pfn needed for mtrr trimming. otherwise setup_memory() is called two times... that is duplicated... [ mingo@elte.hu: both Thomas and me simulated a double call to setup_bootmem_allocator() and can confirm that it is a real bug which can hang in certain configs. It's not been reported yet but that is probably due to the relatively scarce nature of MTRR-trimming systems. 
] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_32.c | 9 ++++----- arch/x86/mm/discontig_32.c | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index a1d7071a51c9..2b3e5d45176b 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -406,8 +406,6 @@ static unsigned long __init setup_memory(void) */ min_low_pfn = PFN_UP(init_pg_tables_end); - find_max_pfn(); - max_low_pfn = find_max_low_pfn(); #ifdef CONFIG_HIGHMEM @@ -764,12 +762,13 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_init(); - max_low_pfn = setup_memory(); - /* update e820 for memory not covered by WB MTRRs */ + find_max_pfn(); mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - max_low_pfn = setup_memory(); + find_max_pfn(); + + max_low_pfn = setup_memory(); #ifdef CONFIG_VMI /* diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c394ca0720b8..8e25e06ff730 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -324,7 +324,6 @@ unsigned long __init setup_memory(void) * this space and use it to adjust the boundary between ZONE_NORMAL * and ZONE_HIGHMEM. */ - find_max_pfn(); get_memcfg_numa(); kva_pages = calculate_numa_remap_pages(); -- cgit v1.2.3 From d546b67a940eb42a99f56b86c5cd8d47c8348c2a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 25 Mar 2008 17:39:12 -0700 Subject: x86: fix performance drop for glx fix the 3D performance drop reported at: http://bugzilla.kernel.org/show_bug.cgi?id=10328 fb drivers are using ioremap()/ioremap_nocache(), followed by mtrr_add with WC attribute. Recent changes in page attribute code made both ioremap()/ioremap_nocache() mappings as UC (instead of previous UC-). This breaks the graphics performance, as the effective memory type is UC instead of expected WC. The correct way to fix this is to add ioremap_wc() (which uses UC- in the absence of PAT kernel support and WC with PAT) and change all the fb drivers to use this new ioremap_wc() API. We can take this correct and longer route for post 2.6.25. For now, revert back to the UC- behavior for ioremap/ioremap_nocache. Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 6 +++++- arch/x86/mm/pageattr.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4afaba0ed722..794895c6dcc9 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -137,7 +137,11 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, switch (mode) { case IOR_MODE_UNCACHED: default: - prot = PAGE_KERNEL_NOCACHE; + /* + * FIXME: we will use UC MINUS for now, as video fb drivers + * depend on it. Upcoming ioremap_wc() will fix this behavior. 
+ */ + prot = PAGE_KERNEL_UC_MINUS; break; case IOR_MODE_CACHED: prot = PAGE_KERNEL; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 14e48b5a94ba..7b79f6be4e7d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -771,7 +771,7 @@ static inline int change_page_attr_clear(unsigned long addr, int numpages, int set_memory_uc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PCD | _PAGE_PWT)); + __pgprot(_PAGE_PCD)); } EXPORT_SYMBOL(set_memory_uc); -- cgit v1.2.3 From bc713dcf35c427ae8377fb9a4d1b7f891054ce13 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 15:58:28 +0100 Subject: x86: fix prefetch workaround some early Athlon XP's and Opterons generate bogus faults on prefetch instructions. The workaround for this regressed over .24 - reinstate it. Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fdc667422df9..c0c82bc143c9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -92,7 +92,8 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned char *max_instr; #ifdef CONFIG_X86_32 - if (!(__supported_pte_mask & _PAGE_NX)) + /* Catch an obscure case of prefetch inside an NX page: */ + if ((__supported_pte_mask & _PAGE_NX) && (error_code & 16)) return 0; #endif -- cgit v1.2.3 From d8d4f157b8d828bc837f0eb2ee4a2dd40dbdd572 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 4 Mar 2008 15:05:39 -0800 Subject: x86: ptrace.c: fix defined-but-unused warnings arch/x86/kernel/ptrace.c:548: warning: 'ptrace_bts_get_size' defined but not used arch/x86/kernel/ptrace.c:558: warning: 'ptrace_bts_read_record' defined but not used arch/x86/kernel/ptrace.c:607: warning: 'ptrace_bts_clear' defined but not used arch/x86/kernel/ptrace.c:617: warning: 'ptrace_bts_drain' defined but not used arch/x86/kernel/ptrace.c:720: warning: 'ptrace_bts_config' defined but not used arch/x86/kernel/ptrace.c:788: warning: 'ptrace_bts_status' defined but not used Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ptrace.c | 169 ++++++++++++++++++++++++----------------------- 1 file changed, 85 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d5904eef1d31..eb92ccbb3502 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -600,21 +600,6 @@ static int ptrace_bts_read_record(struct task_struct *child, return sizeof(ret); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) -{ - int retval; - - if (!child->thread.ds_area_msr) - return -ENXIO; - - retval = ds_write_bts((void *)child->thread.ds_area_msr, in); - if (retval) - return retval; - - return sizeof(*in); -} - static int ptrace_bts_clear(struct task_struct *child) { if (!child->thread.ds_area_msr) @@ -657,75 +642,6 @@ static int ptrace_bts_drain(struct task_struct *child, return end; } -static int ptrace_bts_realloc(struct task_struct *child, - int size, int reduce_size) -{ - unsigned long rlim, vm; - int ret, old_size; - - if (size < 0) - return -EINVAL; - - old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); - if (old_size < 0) - return old_size; - - ret = ds_free((void **)&child->thread.ds_area_msr); - if (ret < 0) - goto out; - - size >>= PAGE_SHIFT; - old_size >>= PAGE_SHIFT; - - 
current->mm->total_vm -= old_size; - current->mm->locked_vm -= old_size; - - if (size == 0) - goto out; - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->total_vm; - if (size <= 0) - goto out; - } - - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->locked_vm; - if (size <= 0) - goto out; - } - - ret = ds_allocate((void **)&child->thread.ds_area_msr, - size << PAGE_SHIFT); - if (ret < 0) - goto out; - - current->mm->total_vm += size; - current->mm->locked_vm += size; - -out: - if (child->thread.ds_area_msr) - set_tsk_thread_flag(child, TIF_DS_AREA_MSR); - else - clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); - - return ret; -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -828,6 +744,91 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } + +static int ptrace_bts_write_record(struct task_struct *child, + const struct bts_struct *in) +{ + int retval; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + retval = ds_write_bts((void *)child->thread.ds_area_msr, in); + if (retval) + return retval; + + return sizeof(*in); +} + +static int ptrace_bts_realloc(struct task_struct *child, + int size, int reduce_size) +{ + unsigned long rlim, vm; + int ret, old_size; + + if (size < 0) + return -EINVAL; + + old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); + if (old_size < 0) + return old_size; + + ret = ds_free((void **)&child->thread.ds_area_msr); + if (ret < 0) + goto out; + + size >>= PAGE_SHIFT; + old_size >>= PAGE_SHIFT; + + current->mm->total_vm -= old_size; + current->mm->locked_vm -= old_size; + + if (size == 0) + goto out; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->total_vm; + if (size <= 0) + goto out; + } + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->locked_vm; + if (size <= 0) + goto out; + } + + ret = ds_allocate((void **)&child->thread.ds_area_msr, + size << PAGE_SHIFT); + if (ret < 0) + goto out; + + current->mm->total_vm += size; + current->mm->locked_vm += size; + +out: + if (child->thread.ds_area_msr) + set_tsk_thread_flag(child, TIF_DS_AREA_MSR); + else + clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); + + return ret; +} + void ptrace_bts_take_timestamp(struct task_struct *tsk, enum bts_qualifier qualifier) { -- cgit v1.2.3 From b2ef749720a97053d60605a7456772a1752164cc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 26 Mar 2008 22:39:15 +0100 Subject: rdc321x: GPIO routines bugfixes This patch fixes the use of GPIO routines which are in the PCI configuration space of the RDC321x, therefore reading/writing to this space without spinlock protection can be problematic. We also now request and free GPIOs and support the MGB100 board, previous code was very AR525W-centric. 
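[ note: a simplified illustration of the race the locking closes; the indexed address/data port pair is shared state, so two unlocked read-modify-write sequences can interleave and lose an update. conf_set_bit() below is a hypothetical reduction of the patch's rdc321x_conf_or(), not the driver's literal code:

	static DEFINE_SPINLOCK(gpio_lock);

	static void conf_set_bit(unsigned addr, u32 bit)
	{
		unsigned long flags;
		u32 val;

		/* ADDR selects the config register, DATA accesses it;
		 * without the lock, an interrupt handler could re-target
		 * ADDR between our outl() and inl(). */
		spin_lock_irqsave(&gpio_lock, flags);
		outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
		val = inl(RDC3210_CFGREG_DATA) | bit;
		outl(val, RDC3210_CFGREG_DATA);
		spin_unlock_irqrestore(&gpio_lock, flags);
	}
]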
Signed-off-by: Volker Weiss Signed-off-by: Florian Fainelli Signed-off-by: Ingo Molnar --- arch/x86/mach-rdc321x/gpio.c | 199 +++++++++++++++++++++++++++++---------- arch/x86/mach-rdc321x/platform.c | 2 + 2 files changed, 153 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c index 031269163bd6..247f33d3a407 100644 --- a/arch/x86/mach-rdc321x/gpio.c +++ b/arch/x86/mach-rdc321x/gpio.c @@ -1,91 +1,194 @@ /* - * Copyright (C) 2007, OpenWrt.org, Florian Fainelli - * RDC321x architecture specific GPIO support + * GPIO support for RDC SoC R3210/R8610 + * + * Copyright (C) 2007, Florian Fainelli + * Copyright (C) 2008, Volker Weiss + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. */ -#include -#include + +#include #include #include #include -#include +#include #include -static inline int rdc_gpio_is_valid(unsigned gpio) + +/* spin lock to protect our private copy of GPIO data register plus + the access to PCI conf registers. */ +static DEFINE_SPINLOCK(gpio_lock); + +/* copy of GPIO data registers */ +static u32 gpio_data_reg1; +static u32 gpio_data_reg2; + +static u32 gpio_request_data[2]; + + +static inline void rdc321x_conf_write(unsigned addr, u32 value) { - return (gpio <= RDC_MAX_GPIO); + outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); + outl(value, RDC3210_CFGREG_DATA); } -static unsigned int rdc_gpio_read(unsigned gpio) +static inline void rdc321x_conf_or(unsigned addr, u32 value) { - unsigned int val; - - val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x84:0x48)); - outl(val, RDC3210_CFGREG_ADDR); - udelay(10); - val = inl(RDC3210_CFGREG_DATA); - val |= (0x1 << (gpio & 0x1F)); - outl(val, RDC3210_CFGREG_DATA); - udelay(10); - val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x88:0x4C)); - outl(val, RDC3210_CFGREG_ADDR); - udelay(10); - val = inl(RDC3210_CFGREG_DATA); - - return val; + outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); + value |= inl(RDC3210_CFGREG_DATA); + outl(value, RDC3210_CFGREG_DATA); } -static void rdc_gpio_write(unsigned int val) +static inline u32 rdc321x_conf_read(unsigned addr) { - if (val) { - outl(val, RDC3210_CFGREG_DATA); - udelay(10); - } + outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); + + return inl(RDC3210_CFGREG_DATA); } -int rdc_gpio_get_value(unsigned gpio) +/* configure pin as GPIO */ +static void rdc321x_configure_gpio(unsigned gpio) +{ + unsigned long flags; + + spin_lock_irqsave(&gpio_lock, flags); + rdc321x_conf_or(gpio < 32 + ? 
RDC321X_GPIO_CTRL_REG1 : RDC321X_GPIO_CTRL_REG2, + 1 << (gpio & 0x1f)); + spin_unlock_irqrestore(&gpio_lock, flags); +} + +/* initially setup the 2 copies of the gpio data registers. + This function must be called by the platform setup code. */ +void __init rdc321x_gpio_setup() +{ + /* this might not be, what others (BIOS, bootloader, etc.) + wrote to these registers before, but it's a good guess. Still + better than just using 0xffffffff. */ + + gpio_data_reg1 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG1); + gpio_data_reg2 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG2); +} + +/* determine, if gpio number is valid */ +static inline int rdc321x_is_gpio(unsigned gpio) +{ + return gpio <= RDC321X_MAX_GPIO; +} + +/* request GPIO */ +int rdc_gpio_request(unsigned gpio, const char *label) { - if (rdc_gpio_is_valid(gpio)) - return (int)rdc_gpio_read(gpio); - else + unsigned long flags; + + if (!rdc321x_is_gpio(gpio)) return -EINVAL; + + spin_lock_irqsave(&gpio_lock, flags); + if (gpio_request_data[(gpio & 0x20) ? 1 : 0] & (1 << (gpio & 0x1f))) + goto inuse; + gpio_request_data[(gpio & 0x20) ? 1 : 0] |= (1 << (gpio & 0x1f)); + spin_unlock_irqrestore(&gpio_lock, flags); + + return 0; +inuse: + spin_unlock_irqrestore(&gpio_lock, flags); + return -EINVAL; } -EXPORT_SYMBOL(rdc_gpio_get_value); +EXPORT_SYMBOL(rdc_gpio_request); -void rdc_gpio_set_value(unsigned gpio, int value) +/* release previously-claimed GPIO */ +void rdc_gpio_free(unsigned gpio) { - unsigned int val; + unsigned long flags; - if (!rdc_gpio_is_valid(gpio)) + if (!rdc321x_is_gpio(gpio)) return; - val = rdc_gpio_read(gpio); + spin_lock_irqsave(&gpio_lock, flags); + gpio_request_data[(gpio & 0x20) ? 1 : 0] &= ~(1 << (gpio & 0x1f)); + spin_unlock_irqrestore(&gpio_lock, flags); +} +EXPORT_SYMBOL(rdc_gpio_free); + +/* read GPIO pin */ +int rdc_gpio_get_value(unsigned gpio) +{ + u32 reg; + unsigned long flags; + + spin_lock_irqsave(&gpio_lock, flags); + reg = rdc321x_conf_read(gpio < 32 + ? RDC321X_GPIO_DATA_REG1 : RDC321X_GPIO_DATA_REG2); + spin_unlock_irqrestore(&gpio_lock, flags); - if (value) - val &= ~(0x1 << (gpio & 0x1F)); - else - val |= (0x1 << (gpio & 0x1F)); + return (1 << (gpio & 0x1f)) & reg ? 
1 : 0; +} +EXPORT_SYMBOL(rdc_gpio_get_value); - rdc_gpio_write(val); +/* set GPIO pin to value */ +void rdc_gpio_set_value(unsigned gpio, int value) +{ + unsigned long flags; + u32 reg; + + reg = 1 << (gpio & 0x1f); + if (gpio < 32) { + spin_lock_irqsave(&gpio_lock, flags); + if (value) + gpio_data_reg1 |= reg; + else + gpio_data_reg1 &= ~reg; + rdc321x_conf_write(RDC321X_GPIO_DATA_REG1, gpio_data_reg1); + spin_unlock_irqrestore(&gpio_lock, flags); + } else { + spin_lock_irqsave(&gpio_lock, flags); + if (value) + gpio_data_reg2 |= reg; + else + gpio_data_reg2 &= ~reg; + rdc321x_conf_write(RDC321X_GPIO_DATA_REG2, gpio_data_reg2); + spin_unlock_irqrestore(&gpio_lock, flags); + } } EXPORT_SYMBOL(rdc_gpio_set_value); +/* configure GPIO pin as input */ int rdc_gpio_direction_input(unsigned gpio) { + if (!rdc321x_is_gpio(gpio)) + return -EINVAL; + + rdc321x_configure_gpio(gpio); + return 0; } EXPORT_SYMBOL(rdc_gpio_direction_input); +/* configure GPIO pin as output and set value */ int rdc_gpio_direction_output(unsigned gpio, int value) { + if (!rdc321x_is_gpio(gpio)) + return -EINVAL; + + gpio_set_value(gpio, value); + rdc321x_configure_gpio(gpio); + return 0; } EXPORT_SYMBOL(rdc_gpio_direction_output); - - diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c index dda6024a5862..a037041817c7 100644 --- a/arch/x86/mach-rdc321x/platform.c +++ b/arch/x86/mach-rdc321x/platform.c @@ -62,6 +62,8 @@ static struct platform_device *rdc321x_devs[] = { static int __init rdc_board_setup(void) { + rdc321x_gpio_setup(); + return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs)); } -- cgit v1.2.3 From 25e59881f109dc6378ebc463ae4c2de907435de3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 26 Mar 2008 21:03:04 -0700 Subject: x86: stricter check in follow_huge_addr() The first page of the compound page is determined in follow_huge_addr() but then PageCompound() only checks if the page is part of a compound page. PageHead() allows checking if this is indeed the first page of the compound. Cc: Jeremy Fitzhardinge Signed-off-by: Christoph Lameter Signed-off-by: Ingo Molnar --- arch/x86/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 4fbafb4bc2f0..0b3d567e686d 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -178,7 +178,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - WARN_ON(!PageCompound(page)); + WARN_ON(!PageHead(page)); return page; } -- cgit v1.2.3 From 04c44a080d2f699a3042d4e743f7ad2ffae9d538 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:52 -0700 Subject: xen: fix RMW when unmasking events xen_irq_enable_direct and xen_sysexit were using "andw $0x00ff, XEN_vcpu_info_pending(vcpu)" to unmask events and test for pending ones in one instruction. Unfortunately, the pending flag must be modified with a locked operation since it can be set by another CPU, and the unlocked form of this operation was causing the pending flag to get lost, allowing the processor to return to usermode with pending events and ultimately deadlock. The simple fix would be to make it a locked operation, but that's rather costly and unnecessary.
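[ note: a schematic of the lost-update window, using a hypothetical two-byte layout (pending in the low byte, mask in the high byte) to make the next paragraph concrete -- this is an illustration, not the Xen ABI:

	struct vcpu_flags {
		unsigned char pending;	/* set asynchronously by Xen */
		unsigned char mask;	/* cleared by the guest      */
	};

	/*
	 * "andw $0x00ff" over both bytes is one 16-bit read-modify-write:
	 *
	 *	guest: load  {pending=0, mask=1}
	 *	Xen:   set pending = 1
	 *	guest: store {pending=0, mask=0}   <- Xen's bit is lost
	 *
	 * Only a locked RMW, or avoiding the RMW entirely, closes
	 * this window.
	 */
]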
The fix here is to split the mask-clearing and pending-testing into two instructions; the interrupt window between them is of no concern because either way pending or new events will be processed. This should fix lingering bugs in using direct vcpu structure access too. [ Stable: needed in 2.6.24.x ] Signed-off-by: Jeremy Fitzhardinge Cc: Stable Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/xen-asm.S | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8b9ee27805fd..1a20318c8cff 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -95,7 +95,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 0; +static int have_vcpu_info_placement = 1; static void __init xen_vcpu_setup(int cpu) { diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1a43b60c0c62..6b7190449d07 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -33,12 +33,17 @@ events, then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) - /* Clear mask and test pending */ - andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with any pending interrupts. The pending check may end up being run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending jz 1f + 2: call check_events 1: ENDPATCH(xen_irq_enable_direct) -- cgit v1.2.3 From 2e8fe719b57bbdc9e313daed1204bb55fed3ed44 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:53 -0700 Subject: xen: fix UP setup of shared_info We need to set up the shared_info pointer once we've mapped the real shared_info into its fixmap slot. That needs to happen once the general pagetable setup has been done. Previously, the UP shared_info was set up once in xen_start_kernel, but that was left pointing to the dummy shared info. Unfortunately there's no really good place to do a later setup of the shared_info in UP, so just do it once the pagetable setup has been done.
[ Stable: needed in 2.6.24.x ] Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1a20318c8cff..de4e6f05840b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -103,6 +103,7 @@ static void __init xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; + BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; if (!have_vcpu_info_placement) @@ -805,33 +806,43 @@ static __init void xen_pagetable_setup_start(pgd_t *base) PFN_DOWN(__pa(xen_start_info->pt_base))); } -static __init void xen_pagetable_setup_done(pgd_t *base) +static __init void setup_shared_info(void) { - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pt = xen_alloc_pt; - pv_mmu_ops.alloc_pd = xen_alloc_pd; - pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pt; - pv_mmu_ops.set_pte = xen_set_pte; - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); + /* * Create a mapping for the shared info page. * Should be set_fixmap(), but shared_info is a machine * address with no corresponding pseudo-phys address. */ - set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + set_pte_mfn(addr, PFN_DOWN(xen_start_info->shared_info), PAGE_KERNEL); - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - + HYPERVISOR_shared_info = (struct shared_info *)addr; } else HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); +#ifndef CONFIG_SMP + /* In UP this is as good a place as any to set up shared info */ + xen_setup_vcpu_info_placement(); +#endif +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pt = xen_alloc_pt; + pv_mmu_ops.alloc_pd = xen_alloc_pd; + pv_mmu_ops.release_pt = xen_release_pt; + pv_mmu_ops.release_pd = xen_release_pt; + pv_mmu_ops.set_pte = xen_set_pte; + + setup_shared_info(); + /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ { @@ -1182,15 +1193,9 @@ asmlinkage void __init xen_start_kernel(void) x86_write_percpu(xen_cr3, __pa(pgd)); x86_write_percpu(xen_current_cr3, __pa(pgd)); -#ifdef CONFIG_SMP /* Don't do the full vcpu_info placement stuff until we have a - possible map. */ + possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; -#else - /* May as well do it now, since there's no good time to call - it later on UP. */ - xen_setup_vcpu_info_placement(); -#endif pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) -- cgit v1.2.3 From 3085354de635179d70c240e6d942bcbd1d93056c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 21:29:09 +0100 Subject: x86: prefetch fix #2 Linus noticed a second bug and an uncleanliness: - we'd return on any instruction fetch fault - we'd use both the value of 16 and the PF_INSTR symbol which are the same and make no sense. The cleanup nicely unifies this piece of logic.
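[ note: for reference, the page-fault error-code bits in question -- these mirror the PF_* defines in arch/x86/mm/fault.c of this era, and show that the literal 16 and the PF_INSTR symbol name the same bit:

	#define PF_PROT		(1 << 0)	/* protection fault, page present */
	#define PF_WRITE	(1 << 1)	/* write access                   */
	#define PF_USER		(1 << 2)	/* fault from user mode           */
	#define PF_RSVD		(1 << 3)	/* reserved bit set               */
	#define PF_INSTR	(1 << 4)	/* instruction fetch, i.e. 16     */
]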
Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c0c82bc143c9..ec08d8389850 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -91,13 +91,10 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr, int prefetch = 0; unsigned char *max_instr; -#ifdef CONFIG_X86_32 - /* Catch an obscure case of prefetch inside an NX page: */ - if ((__supported_pte_mask & _PAGE_NX) && (error_code & 16)) - return 0; -#endif - - /* If it was a exec fault on NX page, ignore */ + /* + * If it was a exec (instruction fetch) fault on NX page, then + * do not ignore the fault: + */ if (error_code & PF_INSTR) return 0; -- cgit v1.2.3 From a6bd8e13034dd7d60b6f14217096efa192d0adc1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 28 Mar 2008 11:05:53 -0500 Subject: lguest: comment documentation update. Took some cycles to re-read the Lguest Journey end-to-end, fix some rot and tighten some phrases. Only comments change. No new jokes, but a couple of recycled old jokes. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 108 +++++++++++++++++++++++++------------------- arch/x86/lguest/i386_head.S | 15 ++++-- 2 files changed, 73 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a104c532ff70..3335b4595efd 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -10,21 +10,19 @@ * (such as the example in Documentation/lguest/lguest.c) is called the * Launcher. * - * Secondly, we only run specially modified Guests, not normal kernels. When - * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets - * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows - * how to be a Guest. This means that you can use the same kernel you boot - * normally (ie. as a Host) as a Guest. + * Secondly, we only run specially modified Guests, not normal kernels: setting + * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows + * how to be a Guest at boot time. This means that you can use the same kernel + * you boot normally (ie. as a Host) as a Guest. * * These Guests know that they cannot do privileged operations, such as disable * interrupts, and that they have to ask the Host to do such things explicitly. * This file consists of all the replacements for such low-level native * hardware operations: these special Guest versions call the Host. * - * So how does the kernel know it's a Guest? The Guest starts at a special - * entry point marked with a magic string, which sets up a few things then - * calls here. We replace the native functions various "paravirt" structures - * with our Guest versions, then boot like normal. :*/ + * So how does the kernel know it's a Guest? We'll see that later, but let's + * just say that we end up here where we replace the native functions various + * "paravirt" structures with our Guest versions, then boot like normal. :*/ /* * Copyright (C) 2006, Rusty Russell IBM Corporation. @@ -134,7 +132,7 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing. 
*/ + * future processing: */ static void lazy_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, @@ -147,7 +145,7 @@ static void lazy_hcall(unsigned long call, } /* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue a hypercall to flush any stored calls. */ + * issue the do-nothing hypercall to flush any stored calls. */ static void lguest_leave_lazy_mode(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); @@ -164,7 +162,7 @@ static void lguest_leave_lazy_mode(void) * * So instead we keep an "irq_enabled" field inside our "struct lguest_data", * which the Guest can update with a single instruction. The Host knows to - * check there when it wants to deliver an interrupt. + * check there before it tries to deliver an interrupt. */ /* save_flags() is expected to return the processor state (ie. "flags"). The @@ -196,10 +194,15 @@ static void irq_enable(void) /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way + * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way * would be to put the "irq_enabled" field in a page by itself, and have the * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. :*/ + * There will then be a page fault as soon as interrupts are re-enabled. + * + * A better method is to implement soft interrupt disable generally for x86: + * instead of disabling interrupts, we set a flag. If an interrupt does come + * in, we then disable them for real. This is uncommon, so we could simply use + * a hypercall for interrupt control and not worry about efficiency. :*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -212,6 +215,10 @@ static void irq_enable(void) static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { + /* The gate_desc structure is 8 bytes long: we hand it to the Host in + * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors + * around like this; typesafety wasn't a big concern in Linux's early + * years. */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -243,7 +250,8 @@ static void lguest_load_idt(const struct desc_ptr *desc) * * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY * hypercall and use that repeatedly to load a new IDT. I don't think it - * really matters, but wouldn't it be nice if they were the same? + * really matters, but wouldn't it be nice if they were the same? Wouldn't + * it be even better if you were the one to send the patch to fix it? */ static void lguest_load_gdt(const struct desc_ptr *desc) { @@ -298,9 +306,9 @@ static void lguest_load_tr_desc(void) /* The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you - * might imagine, after a decade and a half this treatment, it is now a giant - * ball of hair. Its entry in the current Intel manual runs to 28 pages. + * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. + * As you might imagine, after a decade and a half this treatment, it is now a + * giant ball of hair. 
Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry * has been translated into 4 languages. I am not making this up! @@ -594,17 +602,17 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at, - * or 0 if it's unusable as a reliable clock source. This matches what we want - * here: if we return 0 from this function, the x86 TSC clock will not register - * itself. */ +/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us + * what speed it runs at, or 0 if it's unusable as a reliable clock source. + * This matches what we want here: if we return 0 from this function, the x86 + * TSC clock will give up and not register itself. */ static unsigned long lguest_cpu_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where - * we read the time value given to us by the Host. */ +/* If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. */ static cycle_t lguest_clock_read(void) { unsigned long sec, nsec; @@ -648,12 +656,16 @@ static struct clocksource lguest_clock = { static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { + /* FIXME: I don't think this can ever happen, but James tells me he had + * to put this code in. Maybe we should remove it now. Anyone? */ if (delta < LG_CLOCK_MIN_DELTA) { if (printk_ratelimit()) printk(KERN_DEBUG "%s: small delta %lu ns\n", __FUNCTION__, delta); return -ETIME; } + + /* Please wake us this far in the future. */ hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); return 0; } @@ -738,7 +750,7 @@ static void lguest_time_init(void) * will not tolerate us trying to use that), the stack pointer, and the number * of pages in the stack. */ static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, THREAD_SIZE/PAGE_SIZE); @@ -786,9 +798,8 @@ static void lguest_safe_halt(void) hcall(LHCALL_HALT, 0, 0, 0); } -/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a - * message out when we're crashing as well as elegant termination like powering - * off. +/* The SHUTDOWN hypercall takes a string to describe what's happening, and + * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses * rather than virtual addresses, so we use __pa() here. */ @@ -816,8 +827,9 @@ static struct notifier_block paniced = { /* Setting up memory is fairly easy. */ static __init char *lguest_memory_setup(void) { - /* We do this here and not earlier because lockcheck barfs if we do it - * before start_kernel() */ + /* We do this here and not earlier because lockcheck used to barf if we + * did it before start_kernel(). I think we fixed that, so it'd be + * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); /* The Linux bootloader header contains an "e820" memory map: the @@ -850,12 +862,19 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } +/* Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. 
*/ +static void lguest_restart(char *reason) +{ + hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); +} + /*G:050 * Patching (Powerfully Placating Performance Pedants) * - * We have already seen that pv_ops structures let us replace simple - * native instructions with calls to the appropriate back end all throughout - * the kernel. This allows the same kernel to run as a Guest and as a native + * We have already seen that pv_ops structures let us replace simple native + * instructions with calls to the appropriate back end all throughout the + * kernel. This allows the same kernel to run as a Guest and as a native * kernel, but it's slow because of all the indirect branches. * * Remember that David Wheeler quote about "Any problem in computer science can @@ -908,14 +927,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); -} - -/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops - * structures in the kernel provide points for (almost) every routine we have - * to override to avoid privileged instructions. */ +/*G:030 Once we get to lguest_init(), we know we're a Guest. The various + * pv_ops structures in the kernel provide points for (almost) every routine we + * have to override to avoid privileged instructions. */ __init void lguest_init(void) { /* We're under lguest, paravirt is enabled, and we're running at @@ -1003,9 +1017,9 @@ __init void lguest_init(void) * the normal data segment to get through booting. */ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); - /* The Host uses the top of the Guest's virtual address space for the - * Host<->Guest Switcher, and it tells us how big that is in - * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ + /* The Host<->Guest Switcher lives at the top of our address space, and + * the Host told us how big it is when we made LGUEST_INIT hypercall: + * it put the answer in lguest_data.reserve_mem */ reserve_top_address(lguest_data.reserve_mem); /* If we don't initialize the lock dependency checker now, it crashes @@ -1027,6 +1041,7 @@ __init void lguest_init(void) /* Math is always hard! */ new_cpu_data.hard_math = 1; + /* We don't have features. We have puppies! Puppies! */ #ifdef CONFIG_X86_MCE mce_disabled = 1; #endif @@ -1044,10 +1059,11 @@ __init void lguest_init(void) virtio_cons_early_init(early_put_chars); /* Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off. */ + * the Guest routine to power off, and the reboot hook to our restart + * routine. */ pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; + /* Now we're set up, call start_kernel() in init/main.c and we proceed * to boot as normal. It never returns. */ start_kernel(); diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 95b6fbcded63..5c7cef34c9e7 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,13 +5,20 @@ #include #include -/*G:020 This is where we begin: head.S notes that the boot header's platform - * type field is "1" (lguest), so calls us here. +/*G:020 Our story starts with the kernel booting into startup_32 in + * arch/x86/kernel/head_32.S. It expects a boot header, which is created by + * the bootloader (the Launcher in our case). 
+ * + * The startup_32 function does very little: it clears the uninitialized global + * C variables which we expect to be zero (ie. BSS) and then copies the boot + * header and kernel command line somewhere safe. Finally it checks the + * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: + * if it's set to '1' (lguest's assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical * addesses (around 0), not above PAGE_OFFSET as most code expectes * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data. + * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after * boot. */ @@ -24,7 +31,7 @@ ENTRY(lguest_entry) int $LGUEST_TRAP_ENTRY /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl - * instruction uses %esi implicitly as the source for the copy we' + * instruction uses %esi implicitly as the source for the copy we're * about to do. */ movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi -- cgit v1.2.3 From 9c312058b2e530722c7bd30c1b6f26eea35dc5fe Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 28 Mar 2008 11:47:34 -0700 Subject: Avoid false positive warnings in kmap_atomic_prot() with DEBUG_HIGHMEM I believe http://bugzilla.kernel.org/show_bug.cgi?id=10318 is a false positive. There's no way in which networking will be using highmem pages here, so it won't be taking the KM_USER0 kmap slot, so there's no point in performing these checks. Cc: Pawel Staszewski Cc: Ingo Molnar Acked-by: Christoph Lameter Cc: "David S. Miller" Signed-off-by: Andrew Morton [ Really sad. We lose almost all real-life coverage of the debug tests with this patch. Now it will only report problems for the cases where people actually end up using a HIGHMEM page, not when they just _might_ use one. - Linus ] Signed-off-by: Linus Torvalds --- arch/x86/mm/highmem_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 3d936f232704..9cf33d3ee5bc 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -73,15 +73,15 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ - - debug_kmap_atomic_prot(type); + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); if (!PageHighMem(page)) return page_address(page); + debug_kmap_atomic_prot(type); + idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); -- cgit v1.2.3 From 629c8b4cdb354518308663aff2f719e02f69ffbe Mon Sep 17 00:00:00 2001 From: Ken'ichi Ohmichi Date: Wed, 2 Apr 2008 13:04:50 -0700 Subject: vmcoreinfo: add the symbol "phys_base" Fix the problem that makedumpfile sometimes fails on x86_64 machine. This patch adds the symbol "phys_base" to a vmcoreinfo data. The vmcoreinfo data has the minimum debugging information only for dump filtering. makedumpfile (dump filtering command) gets it to distinguish unnecessary pages, and makedumpfile creates a small dumpfile. On x86_64 kernel which compiled with CONFIG_PHYSICAL_START=0x0 and CONFIG_RELOCATABLE=y, makedumpfile fails like the following: # makedumpfile -d31 /proc/vmcore dumpfile The kernel version is not supported. 
The created dumpfile may be incomplete. _exclude_free_page: Can't get next online node. makedumpfile Failed. # The cause is the lack of the symbol "phys_base" in the vmcoreinfo data. If the symbol "phys_base" does not exist, makedumpfile considers an x86_64 kernel as non-relocatable. As a result, makedumpfile misunderstands the physical address where the kernel is loaded, and it cannot translate a kernel virtual address to a physical address correctly. To fix this problem, this patch adds the symbol "phys_base" to the vmcoreinfo data. Signed-off-by: Ken'ichi Ohmichi Cc: "Eric W. Biederman" Cc: Acked-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 236d2f8f7ddc..576a03db4511 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -233,6 +233,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { + VMCOREINFO_SYMBOL(phys_base); VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA -- cgit v1.2.3 From 4ba51fd75cc3789be83f0d6f878dabbb0cb19bca Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 3 Apr 2008 14:18:55 -0700 Subject: x86 ptrace: avoid unnecessary wrmsr This avoids using wrmsr on MSR_IA32_DEBUGCTLMSR when it's not needed. No wrmsr ever needs to be done if no one has ever used block stepping. Without this change, using ptrace on 2.6.25 on an x86 KVM guest will tickle KVM's missing support for the MSR and crash the guest kernel. Though host KVM is the buggy one, this makes for a regression in the guest behavior from 2.6.24->2.6.25 that we can easily avoid. I also corrected some bad whitespace. Signed-off-by: Roland McGrath Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- arch/x86/kernel/step.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 9d406cdc847f..071ff4798236 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -140,6 +140,9 @@ static int enable_single_step(struct task_struct *child) */ static void write_debugctlmsr(struct task_struct *child, unsigned long val) { + if (child->thread.debugctlmsr == val) + return; + child->thread.debugctlmsr = val; if (child != current) @@ -165,11 +168,11 @@ static void enable_step(struct task_struct *child, bool block) write_debugctlmsr(child, child->thread.debugctlmsr | DEBUGCTLMSR_BTF); } else { - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); + write_debugctlmsr(child, + child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); } } -- cgit v1.2.3 From 4f14bdef41e599e218d71e3d0abf339d65e9b480 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 23:37:58 +0100 Subject: x86: fix nmi_watchdog=2 on Pentium-D CPUs implement nmi_watchdog=2 on this class of CPUs: cpu family : 15 model : 6 model name : Intel(R) Pentium(R) D CPU 3.00GHz the watchdog's ->setup() method is safe anyway, so if the CPU cannot support it we'll bail out safely.
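[ note: the bail-out path being relied on, sketched -- a simplification of the probe/setup flow in perfctr-watchdog.c, not its literal code:

	static int watchdog_init_sketch(unsigned nmi_hz)
	{
		if (!wd_ops)			/* no probe match at all */
			return -1;
		/* e.g. a family-15 model without the P4 perfctrs:
		 * ->setup() fails cleanly instead of crashing */
		if (!wd_ops->setup(nmi_hz)) {
			printk(KERN_ERR "NMI watchdog: CPU setup failed\n");
			return -1;
		}
		return 0;
	}
]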
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9b838324b818..19a359472ae1 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -652,9 +652,6 @@ static void probe_nmi_watchdog(void) wd_ops = &p6_wd_ops; break; case 15: - if (boot_cpu_data.x86_model > 0x4) - return; - wd_ops = &p4_wd_ops; break; default: -- cgit v1.2.3 From 9c9b81f77330ddc003a2de2f35fa6a20410c1a62 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 23:39:42 +0100 Subject: x86: print message if nmi_watchdog=2 cannot be enabled right now if there's no CPU support for nmi_watchdog=2 we'll just refuse it silently. print a useful warning. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 19a359472ae1..b943e10ad814 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -667,8 +667,10 @@ int lapic_watchdog_init(unsigned nmi_hz) { if (!wd_ops) { probe_nmi_watchdog(); - if (!wd_ops) + if (!wd_ops) { + printk(KERN_INFO "NMI watchdog: CPU not supported\n"); return -1; + } if (!wd_ops->reserve()) { printk(KERN_ERR -- cgit v1.2.3 From 8f59610de2fb244b5bc1a3feafd328a8d4d511d6 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 1 Apr 2008 14:24:03 +0200 Subject: x86, agpgart: scary messages are fortunately obsolete Fix obsolete printks in aperture-64. We used not to handle missing agpgart, but we handle it okay now. 
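One detail of the diff below is easy to miss: each physical line of the multi-line printk carries its own KERN_WARNING prefix. Under the printk convention of this era, an embedded level marker starts a new log record, so a continuation line without one would be logged at the default level. A minimal sketch of the idiom:

	/* each line of a multi-line printk needs its own level prefix */
	printk(KERN_WARNING "first warning line\n"
	       KERN_WARNING "second warning line\n");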
Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index faf3229f8fb3..700e4647dd30 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -615,8 +615,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ - printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); + printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + KERN_WARNING "falling back to iommu=soft.\n"); return -1; } @@ -692,9 +692,9 @@ void __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (end_pfn > MAX_DMA32_PFN) { - printk(KERN_ERR "WARNING more than 4GB of memory " - "but GART IOMMU not available.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n"); + printk(KERN_WARNING "More than 4GB of memory " + "but GART IOMMU not available.\n" + KERN_WARNING "falling back to iommu=soft.\n"); } return; } -- cgit v1.2.3 From f64337062c09c2c318fbcbf44ed1d739e8bc72ab Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 2 Apr 2008 15:36:36 +0100 Subject: xen: refactor xen_{alloc,release}_{pt,pd}() Signed-off-by: Mark McLoughlin Cc: xen-devel@lists.xensource.com Cc: Mark McLoughlin Cc: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 27 ++++++++++++++++++++------- arch/x86/xen/mmu.c | 7 ------- arch/x86/xen/mmu.h | 7 +++++++ 3 files changed, 27 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index de4e6f05840b..16e2f8096a1a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -667,10 +667,10 @@ static void xen_release_pt_init(u32 pfn) make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -static void pin_pagetable_pfn(unsigned level, unsigned long pfn) +static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) { struct mmuext_op op; - op.cmd = level; + op.cmd = cmd; op.arg1.mfn = pfn_to_mfn(pfn); if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) BUG(); @@ -687,7 +687,10 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - pin_pagetable_pfn(level, pfn); + if (level == PT_PTE) + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + else if (level == PT_PMD) + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, pfn); } else /* make sure there are no stray mappings of this page */ @@ -697,16 +700,16 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) { - xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE); + xen_alloc_ptpage(mm, pfn, PT_PTE); } static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) { - xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE); + xen_alloc_ptpage(mm, pfn, PT_PMD); } /* This should never happen until we're OK to use struct page */ -static void xen_release_pt(u32 pfn) +static void xen_release_ptpage(u32 pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -718,6 +721,16 @@ static void xen_release_pt(u32 pfn) } } +static void xen_release_pt(u32 pfn) +{ + xen_release_ptpage(pfn, PT_PTE); +} + +static void xen_release_pd(u32 pfn) +{ + xen_release_ptpage(pfn, PT_PMD); +} + #ifdef CONFIG_HIGHPTE static void 
*xen_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -838,7 +851,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.alloc_pt = xen_alloc_pt; pv_mmu_ops.alloc_pd = xen_alloc_pd; pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pt; + pv_mmu_ops.release_pd = xen_release_pd; pv_mmu_ops.set_pte = xen_set_pte; setup_shared_info(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0144395448ae..2a054ef2a3da 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -310,13 +310,6 @@ pgd_t xen_make_pgd(unsigned long pgd) } #endif /* CONFIG_X86_PAE */ -enum pt_level { - PT_PGD, - PT_PUD, - PT_PMD, - PT_PTE -}; - /* (Yet another) pagetable walker. This one is intended for pinning a pagetable. This means that it walks a pagetable and calls the diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index c9ff27f3ac3a..b5e189b1519d 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -3,6 +3,13 @@ #include #include +enum pt_level { + PT_PGD, + PT_PUD, + PT_PMD, + PT_PTE +}; + /* * Page-directory addresses above 4GB do not fit into architectural %cr3. * When accessing %cr3, or equivalent field in vcpu_guest_context, guests -- cgit v1.2.3 From a684d69d15a8fafede7c5c0daac8c646bbee805c Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 2 Apr 2008 15:36:37 +0100 Subject: xen: Do not pin/unpin PMD pages i.e. with this simple test case: int fd = open("/dev/zero", O_RDONLY); munmap(mmap((void *)0x40000000, 0x1000_LEN, PROT_READ, MAP_PRIVATE, fd, 0), 0x1000); close(fd); we currently get: kernel BUG at arch/x86/xen/enlighten.c:678! ... EIP is at xen_release_pt+0x79/0xa9 ... Call Trace: [] ? __pmd_free_tlb+0x1a/0x75 [] ? free_pgd_range+0x1d2/0x2b5 [] ? free_pgtables+0x7e/0x93 [] ? unmap_region+0xb9/0xf5 [] ? do_munmap+0x193/0x1f5 [] ? sys_munmap+0x30/0x3f [] ? 
syscall_call+0x7/0xb ======================= and xen complains: (XEN) mm.c:2241:d4 Mfn 1cc37 not pinned Further details at: https://bugzilla.redhat.com/436453 Signed-off-by: Mark McLoughlin Cc: xen-devel@lists.xensource.com Cc: Mark McLoughlin Cc: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 16e2f8096a1a..f16b056e5c56 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -689,8 +689,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); if (level == PT_PTE) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- else if (level == PT_PMD)
- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, pfn);
 } else /* make sure there are no stray mappings of this page */ @@ -715,7 +713,8 @@ static void xen_release_ptpage(u32 pfn, unsigned level) if (PagePinned(page)) { if (!PageHighMem(page)) {
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+ if (level == PT_PTE)
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } } -- cgit v1.2.3 From c946c7de49a9ba50bc205d6359b41bbc8f01174c Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 2 Apr 2008 15:36:38 +0100 Subject: xen: Clear PG_pinned in release_{pt,pd}() Signed-off-by: Mark McLoughlin Cc: xen-devel@lists.xensource.com Cc: Mark McLoughlin Cc: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f16b056e5c56..27ee26aedf94 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -717,6 +717,7 @@ static void xen_release_ptpage(u32 pfn, unsigned level) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); }
+ ClearPagePinned(page);
 } } -- cgit v1.2.3 From 47001d603375f857a7fab0e9c095d964a1ea0039 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Apr 2008 19:45:18 +0200 Subject: x86: tsc prevent time going backwards We already catch most of the TSC problems by sanity checks, but there is a subtle bug which has been in the code forever. This can cause time jumps in the range of hours. This was reported in: http://lkml.org/lkml/2007/8/23/96 and http://lkml.org/lkml/2008/3/31/23 I was able to reproduce the problem with a gettimeofday loop test on a dual-core and a quad-core machine which both have synchronized TSCs. The TSCs seem not to be perfectly in sync, and the kernel is not able to detect the slight delta in the sync check. Still there exists an extremely small window where this delta can be observed with a really big time jump. So far I was only able to reproduce this with the vsyscall gettimeofday implementation, but in theory this might be observable with the syscall-based version as well. CPU0 updates the clock source variables under the xtime/vsyscall lock and CPU1, where the TSC is slightly behind CPU0, reads the time right after the seqlock was unlocked. The clocksource reference data was updated with the TSC from CPU0, and the value read from the TSC on CPU1 is less than the reference data. This results in a huge delta value due to the unsigned subtraction of the TSC value and the reference value.
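In numbers, the unsigned wraparound looks like this (an illustrative sketch with made-up values, not part of the patch):

	/* CPU0 stores cycle_last; CPU1's TSC is a few cycles behind. */
	u64 cycle_last = 1000000;		/* written by CPU0 */
	u64 tsc        = 999990;		/* read on CPU1 */
	u64 delta      = tsc - cycle_last;	/* wraps to 2^64 - 10 */
	/* multiplied out to nanoseconds, this advances xtime by hours */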
This algorithm cannot be changed due to the support of wrapping clock sources like the pm timer. The huge delta is converted to nanoseconds and added to xtime, which is then observable by the caller. The next gettimeofday call on CPU1 will show the correct time again, as by then the TSC has advanced above the reference value. To prevent this TSC-specific wreckage we need to compare the TSC value against the reference value and return the latter when it is larger than the actual TSC value. I considered marking the TSC unstable when the readout is smaller than the reference value, but this would render an otherwise good and fast clocksource unusable without a really good reason. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 15 ++++++++++++++- arch/x86/kernel/tsc_64.c | 23 ++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index f14cfd9d1f94..d7498b34c8e9 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -287,14 +287,27 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0;
+static struct clocksource clocksource_tsc;
+/*
+ * We compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp issue. This can be observed in
+ * a very small window right after one CPU updated cycle_last under
+ * xtime lock and the other CPU reads a TSC value which is smaller
+ * than the cycle_last reference value due to a TSC which is slighty
+ * behind. This delta is nowhere else observable, but in that case it
+ * results in a forward time jump in the range of hours due to the
+ * unsigned delta calculation of the time keeping core code, which is
+ * necessary to support wrapping clocksources like pm timer.
+ */
 static cycle_t read_tsc(void) { cycle_t ret; rdtscll(ret);
- return ret;
+ return ret >= clocksource_tsc.cycle_last ?
+ ret : clocksource_tsc.cycle_last;
 } static struct clocksource clocksource_tsc = { diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 947554ddabb6..01fc9f0c39e2 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -11,6 +11,7 @@ #include #include #include
+#include
 static int notsc __initdata = 0; @@ -290,18 +291,34 @@ int __init notsc_setup(char *s) __setup("notsc", notsc_setup);
+static struct clocksource clocksource_tsc;
-/* clock source code: */
+/*
+ * We compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp. This can be observed in a
+ * very small window right after one CPU updated cycle_last under
+ * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
+ * is smaller than the cycle_last reference value due to a TSC which
+ * is slighty behind. This delta is nowhere else observable, but in
+ * that case it results in a forward time jump in the range of hours
+ * due to the unsigned delta calculation of the time keeping core
+ * code, which is necessary to support wrapping clocksources like pm
+ * timer.
+ */
 static cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles();
- return ret;
+
+ return ret >= clocksource_tsc.cycle_last ?
+ ret : clocksource_tsc.cycle_last;
 } static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles();
- return ret;
+
+ return ret >= __vsyscall_gtod_data.clock.cycle_last ?
+ ret : __vsyscall_gtod_data.clock.cycle_last;
 } static struct clocksource clocksource_tsc = { -- cgit v1.2.3 From 5761d64b277c287a7520b868c32d656ef03374b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Apr 2008 16:26:10 +0200 Subject: x86: revert assign IRQs to hpet timer The commits: commit 37a47db8d7f0f38dac5acf5a13abbc8f401707fa Author: Balaji Rao Date: Wed Jan 30 13:30:03 2008 +0100 x86: assign IRQs to HPET timers, fix and commit e3f37a54f690d3e64995ea7ecea08c5ab3070faf Author: Balaji Rao Date: Wed Jan 30 13:30:03 2008 +0100 x86: assign IRQs to HPET timers have been identified to cause a regression on some platforms due to the assignment of legacy IRQs, which makes the legacy devices connected to those IRQs dysfunctional. Revert them. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=10382 Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 235fd6c77504..36652ea1a265 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -133,13 +133,16 @@ static void hpet_reserve_platform_timers(unsigned long id) #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif
+ hd.hd_irq[0] = HPET_LEGACY_8254;
 hd.hd_irq[1] = HPET_LEGACY_RTC;
- for (i = 2; i < nrtimers; timer++, i++)
- hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
- Tn_INT_ROUTE_CNF_SHIFT;
+ for (i = 2; i < nrtimers; timer++, i++)
+ hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
+ Tn_INT_ROUTE_CNF_SHIFT;
+
 hpet_alloc(&hd);
+
 } #else static void hpet_reserve_platform_timers(unsigned long id) { } -- cgit v1.2.3 From 64ba4f230d30b089bc89db2e59d02c1efa5ac769 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 6 Apr 2008 17:23:38 +1000 Subject: Fix booting pentium+ with dodgy TSC We handle a broken tsc these days, so no need to panic. We clear the TSC bit when tsc_init decides it's unreliable (e.g. under lguest w/ a bad host TSC), leading to a bogus panic. Signed-off-by: Rusty Russell Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/bugs.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 027e5c003b16..170d2f5523b2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -142,14 +142,6 @@ static void __init check_config(void) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif
-/*
- * If we configured ourselves for a TSC, we'd better have one!
- */
-#ifdef CONFIG_X86_TSC
- if (!cpu_has_tsc)
- panic("Kernel compiled for Pentium+, requires TSC feature!");
-#endif
-
 /* * If we were told we had a good local APIC, check for buggy Pentia, * i.e. all B steppings and the C2 stepping of P54C when using their -- cgit v1.2.3 From 5b13d863573e746739ccfc24ac1a9473cfee8df1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 7 Apr 2008 20:58:08 +0200 Subject: revert "x86: tsc prevent time going backwards" revert: | commit 47001d603375f857a7fab0e9c095d964a1ea0039 | Author: Thomas Gleixner | Date: Tue Apr 1 19:45:18 2008 +0200 | | x86: tsc prevent time going backwards it has been identified to cause a suspend regression - and the commit fixes a longstanding bug that existed before 2.6.25 was opened - so it can wait some more until the effects are better understood.
Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 15 +-------------- arch/x86/kernel/tsc_64.c | 23 +++-------------------- 2 files changed, 4 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index d7498b34c8e9..f14cfd9d1f94 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -287,27 +287,14 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0; -static struct clocksource clocksource_tsc; -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp issue. This can be observed in - * a very small window right after one CPU updated cycle_last under - * xtime lock and the other CPU reads a TSC value which is smaller - * than the cycle_last reference value due to a TSC which is slighty - * behind. This delta is nowhere else observable, but in that case it - * results in a forward time jump in the range of hours due to the - * unsigned delta calculation of the time keeping core code, which is - * necessary to support wrapping clocksources like pm timer. - */ static cycle_t read_tsc(void) { cycle_t ret; rdtscll(ret); - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; + return ret; } static struct clocksource clocksource_tsc = { diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 01fc9f0c39e2..947554ddabb6 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -11,7 +11,6 @@ #include #include #include -#include static int notsc __initdata = 0; @@ -291,34 +290,18 @@ int __init notsc_setup(char *s) __setup("notsc", notsc_setup); -static struct clocksource clocksource_tsc; -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp. This can be observed in a - * very small window right after one CPU updated cycle_last under - * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which - * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in - * that case it results in a forward time jump in the range of hours - * due to the unsigned delta calculation of the time keeping core - * code, which is necessary to support wrapping clocksources like pm - * timer. - */ +/* clock source code: */ static cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles(); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; + return ret; } static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles(); - - return ret >= __vsyscall_gtod_data.clock.cycle_last ? - ret : __vsyscall_gtod_data.clock.cycle_last; + return ret; } static struct clocksource clocksource_tsc = { -- cgit v1.2.3 From 4f41c94d5c24e3b3453e9df03c0a80ca1acf00d2 Mon Sep 17 00:00:00 2001 From: Karsten Wiese Date: Mon, 7 Apr 2008 12:14:45 +0200 Subject: x86: fix call to set_cyc2ns_scale() from time_cpufreq_notifier() In time_cpufreq_notifier() the cpu id to act upon is held in freq->cpu. Use it instead of smp_processor_id() in the call to set_cyc2ns_scale(). This makes the preempt_*able() unnecessary and lets set_cyc2ns_scale() update the intended cpu's cyc2ns. 
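A reduced sketch of the pattern being fixed (simplified; see the diffs below for the real call sites):

	static int time_cpufreq_notifier(struct notifier_block *nb,
					 unsigned long val, void *data)
	{
		struct cpufreq_freqs *freq = data;

		/* The notifier can run on any CPU, so smp_processor_id()
		 * is not necessarily the CPU whose frequency changed. */
		set_cyc2ns_scale(cpu_khz, freq->cpu);
		return 0;
	}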
Related mail/thread: http://lkml.org/lkml/2007/12/7/130 Signed-off-by: Karsten Wiese Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 4 +--- arch/x86/kernel/tsc_64.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index f14cfd9d1f94..c2241e04ea5f 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -256,9 +256,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { tsc_khz = cpu_khz;
- preempt_disable();
- set_cyc2ns_scale(cpu_khz, smp_processor_id());
- preempt_enable();
+ set_cyc2ns_scale(cpu_khz, freq->cpu);
 /* * TSC based sched_clock turns * to junk w/ cpufreq diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 947554ddabb6..d3bebaaad842 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -148,9 +148,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); }
- preempt_disable();
- set_cyc2ns_scale(tsc_khz_ref, smp_processor_id());
- preempt_enable();
+ set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
 return 0; } -- cgit v1.2.3 From f4be31ec9690cfe6e94fcbed6ae60a6a38b3c3ed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Apr 2008 19:04:07 -0400 Subject: pop previous section in alternative.c gcc expects all toplevel assembly to return to the original section type. The code in alternative.c does not do this. This caused some strange bugs in sched-devel where code would end up in the .rodata section, and when the kernel sets the NX bit on all .rodata, the kernel would crash when executing this code. This patch adds a .previous marker to return the code back to the original section. Credit goes to Andrew Pinski for telling me it wasn't a gcc bug but a bug in the toplevel asm code in the kernel. ;-) Signed-off-by: Steven Rostedt Signed-off-by: Linus Torvalds --- arch/x86/kernel/alternative.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 45d79ea890ae..5fed98ca0e1f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -65,7 +65,8 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); get them easily into strings.
*/ asm("\t.section .rodata, \"a\"\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
- GENERIC_NOP7 GENERIC_NOP8);
+ GENERIC_NOP7 GENERIC_NOP8
+ "\t.previous");
 extern const unsigned char intelnops[]; static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { NULL, @@ -83,7 +84,8 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { #ifdef K8_NOP1 asm("\t.section .rodata, \"a\"\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8);
+ K8_NOP7 K8_NOP8
+ "\t.previous");
 extern const unsigned char k8nops[]; static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { NULL, @@ -101,7 +103,8 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { #ifdef K7_NOP1 asm("\t.section .rodata, \"a\"\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
- K7_NOP7 K7_NOP8);
+ K7_NOP7 K7_NOP8
+ "\t.previous");
 extern const unsigned char k7nops[]; static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { NULL, @@ -119,7 +122,8 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { #ifdef P6_NOP1 asm("\t.section .rodata, \"a\"\np6nops: " P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
- P6_NOP7 P6_NOP8);
+ P6_NOP7 P6_NOP8
+ "\t.previous");
 extern const unsigned char p6nops[]; static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { NULL, -- cgit v1.2.3 From 783e391b7b5b273cd20856d8f6f4878da8ec31b3 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Thu, 10 Apr 2008 09:49:58 -0700 Subject: x86: Simplify cpu_idle_wait This patch also resolves hangs on boot: http://lkml.org/lkml/2008/2/23/263 http://bugzilla.kernel.org/show_bug.cgi?id=10093 The bug was causing a once-in-a-few-reboots 10-15 second wait during boot on certain laptops. Earlier commit 40d6a146629b98d8e322b6f9332b182c7cbff3df added smp_call_function in cpu_idle_wait() to kick cpus that are in tickless idle. Looking at the cpu_idle_wait code at that time, it seemed over-engineered for a rarely used case (changing the idle handler). Below is a simplified version of cpu_idle_wait, which just makes a dummy smp_call_function to all cpus, to make them come out of the old idle handler and start using the new idle handler. It eliminates the code in the idle loop that handles cpu_idle_wait. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Linus Torvalds --- arch/x86/kernel/process_32.c | 47 +++++++++++--------------------------------- arch/x86/kernel/process_64.c | 47 +++++++++++--------------------------------- 2 files changed, 22 insertions(+), 72 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index be3c7a299f02..43930e73f657 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -82,7 +82,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 void disable_hlt(void) { @@ -190,9 +189,6 @@ void cpu_idle(void) while (!need_resched()) { void (*idle)(void);
- if (__get_cpu_var(cpu_idle_state))
- __get_cpu_var(cpu_idle_state) = 0;
-
 check_pgt_cache(); rmb(); idle = pm_idle; @@ -220,40 +216,19 @@ static void do_nothing(void *unused) { }
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call.
Old + * pm_idle value will not be used by any CPU after the return of this function. + */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, NULL, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3baf9b9f4c87..46c4c546b499 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -63,7 +63,6 @@ EXPORT_SYMBOL(boot_option_idle_override); */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -173,9 +172,6 @@ void cpu_idle(void) while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - rmb(); idle = pm_idle; if (!idle) @@ -207,40 +203,19 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); -- cgit v1.2.3 From 54a015104136974262afa4b8ddd943ea70dec8a2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 10 Apr 2008 15:37:38 -0700 Subject: asmlinkage_protect replaces prevent_tail_call The prevent_tail_call() macro works around the problem of the compiler clobbering argument words on the stack, which for asmlinkage functions is the caller's (user's) struct pt_regs. 
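As an illustration of the hazard (a hypothetical syscall; only the mechanism matters):

	/* On x86-32, an asmlinkage function's arguments live in the stack
	 * slots that overlay the saved user registers.  If gcc compiles
	 * the call below as a sibling call, it may reuse those slots to
	 * pass do_example() its arguments, corrupting the user's pt_regs. */
	asmlinkage int sys_example(int arg)
	{
		return do_example(arg);		/* candidate tail call */
	}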
The tail/sibling-call optimization is not the only way that the compiler can decide to use stack argument words as scratch space, which we have to prevent. Other optimizations can do it too. Until we have new compiler support to make "asmlinkage" binding on the compiler's own use of the stack argument frame, we have to work around all the manifestations of this issue that crop up. More cases seem to be prevented by also keeping the incoming argument variables live at the end of the function. This makes their original stack slots attractive places to leave those variables, so the compiler tends not to clobber them for something else. It's still no guarantee, but it handles some observed cases that prevent_tail_call() did not. Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- arch/x86/kernel/tls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 022bcaa3b42e..ab6bf375a307 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -92,7 +92,7 @@ int do_set_thread_area(struct task_struct *p, int idx, asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) { int ret = do_set_thread_area(current, -1, u_info, 1);
- prevent_tail_call(ret);
+ asmlinkage_protect(1, ret, u_info);
 return ret; } @@ -142,7 +142,7 @@ int do_get_thread_area(struct task_struct *p, int idx, asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) { int ret = do_get_thread_area(current, -1, u_info);
- prevent_tail_call(ret);
+ asmlinkage_protect(1, ret, u_info);
 return ret; } -- cgit v1.2.3
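For completeness, the shape of the new idiom applied to a hypothetical syscall (asmlinkage_protect() is the real macro used in the diff above; sys_example()/do_example() are made up):

	asmlinkage int sys_example(struct foo __user *arg)
	{
		int ret = do_example(current, arg);
		/* keep 'ret' and 'arg' live at the end of the function so
		 * their stack slots are not reused as scratch space */
		asmlinkage_protect(1, ret, arg);
		return ret;
	}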