From fd4fd5aac1282825195c6816ed40a2a6d42db5bf Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sat, 3 Sep 2005 15:54:30 -0700 Subject: [PATCH] mm: consolidate get_order Someone mentioned that almost all the architectures used basically the same implementation of get_order. This patch consolidates them into asm-generic/page.h and includes that in the appropriate places. The exceptions are ia64 and ppc, which have their own (presumably optimised) versions. Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/page.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 8d93f732d72d..10045fd82103 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -104,20 +104,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; */ extern unsigned int __VMALLOC_RESERVE; -/* Pure 2^n version of get_order */ -static __inline__ int get_order(unsigned long size) -{ - int order; - - size = (size-1) >> (PAGE_SHIFT-1); - order = -1; - do { - size >>= 1; - order++; - } while (size); - return order; -} - extern int sysctl_legacy_va_layout; extern int page_is_ram(unsigned long pagenr); @@ -156,4 +142,6 @@ extern int page_is_ram(unsigned long pagenr); #endif /* __KERNEL__ */ +#include <asm-generic/page.h> + #endif /* _I386_PAGE_H */ -- cgit v1.2.3 From 9b4ee40ebbbaf3f8c775b023d89ceedda1167d79 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Sat, 3 Sep 2005 15:54:57 -0700 Subject: [PATCH] mm: correct _PAGE_FILE comment _PAGE_FILE does not indicate whether a file is in page / swap cache, it is set just for non-linear PTE's. Correct the comment for i386, x86_64, UML. Also clarify _PAGE_PROTNONE. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 77c6497f416e..c797286b512f 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -86,9 +86,7 @@ void paging_init(void); #endif /* - * The 4MB page is guessing.. Detailed in the infamous "Chapter H" - * of the Pentium details, but assuming intel did the straightforward - * thing, this bit set in the page directory entry just means that + * _PAGE_PSE set in the page directory entry just means that * the page directory entry points directly to a 4MB-aligned block of * memory. */ @@ -119,8 +117,10 @@ void paging_init(void); #define _PAGE_UNUSED2 0x400 #define _PAGE_UNUSED3 0x800 -#define _PAGE_FILE 0x040 /* set:pagecache unset:swap */ -#define _PAGE_PROTNONE 0x080 /* If not present */ +/* If _PAGE_PRESENT is clear, we use these: */ +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE; + pte_present gives true */ #ifdef CONFIG_X86_PAE #define _PAGE_NX (1ULL<<_PAGE_BIT_NX) #else -- cgit v1.2.3 From 32e51a8c976fc72c3e9bcece9767d9908816bf8e Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Sat, 3 Sep 2005 15:54:59 -0700 Subject: [PATCH] hugetlb: add pte_huge() macro This patch adds a macro pte_huge(pte) for i386/x86_64 which is needed by a patch later in the series. Instead of repeating (_PAGE_PRESENT | _PAGE_PSE), I've added __LARGE_PTE to i386 to match x86_64.
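(For illustration, a hugetlb page-table walker can then test a pmd-level entry directly; a hypothetical caller sketch, not part of this patch:)

	/* A pmd entry that maps a 4MB page has both _PAGE_PSE and
	 * _PAGE_PRESENT set, which is exactly what pte_huge() tests. */
	static inline int pmd_maps_hugepage(pmd_t *pmd)
	{
		return pte_huge(*(pte_t *)pmd);
	}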
Signed-off-by: Adam Litke Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/pgtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index c797286b512f..f51fd2c956bb 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -215,11 +215,13 @@ extern unsigned long pg0[]; * The following only work if pte_present() is true. * Undefined behaviour if not.. */ +#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT) static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } +static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; } /* * The following only works if pte_present() is not true. @@ -236,7 +238,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PRESENT | _PAGE_PSE; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= __LARGE_PTE; return pte; } #ifdef CONFIG_X86_PAE # include -- cgit v1.2.3 From 0e5c9f39f64d8a55c5db37a5ea43e37d3422fd92 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Sat, 3 Sep 2005 15:55:02 -0700 Subject: [PATCH] remove hugetlb_clean_stale_pgtable() and fix huge_pte_alloc() I don't think we need to call hugetlb_clean_stale_pgtable() anymore in 2.6.13 because of the rework with free_pgtables(). It now collects all the pte pages at the time of munmap. It used to collect page table pages only when an entire pgd could be freed, leaving stale pte pages behind. Not anymore with 2.6.13. This function will never be called, and we should turn it into a BUG_ON. I also spotted two problems here, not Adam's fault :-) (1) in huge_pte_alloc(), it looks like a bug to me that pud is not checked before calling pmd_alloc() (2) in hugetlb_clean_stale_pgtable(), it also missed a call to pmd_free_tlb. I think a tlb flush is required to flush the mapping for the page table itself when we clear out the pmd pointing to a pte page. However, since hugetlb_clean_stale_pgtable() is never called, it won't trigger the bug.
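(A sketch of the fixed allocation path, assuming the usual 2.6.13 helpers; illustrative, not the exact hunk:)

	pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd = pgd_offset(mm, addr);
		pud_t *pud = pud_alloc(mm, pgd, addr);
		pmd_t *pmd = NULL;

		/* (1) above: check pud for allocation failure before
		 * handing it to pmd_alloc() */
		if (pud)
			pmd = pmd_alloc(mm, pud, addr);
		return (pte_t *)pmd;
	}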
Signed-off-by: Ken Chen Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/page.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 10045fd82103..73296d9924fb 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -68,7 +68,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA -#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE #endif #define pgd_val(x) ((x).pgd) -- cgit v1.2.3 From fa5b08d5f818063d18433194f20359ef2ae50254 Mon Sep 17 00:00:00 2001 From: Kyle Moffett Date: Sat, 3 Sep 2005 15:55:03 -0700 Subject: [PATCH] slab: consolidate kmem_bufctl_t This is used only in slab.c and each architecture gets to define which underlying type is to be used. Seems a bit silly - move it to slab.c and use the same type for all architectures: unsigned int. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/types.h b/include/asm-i386/types.h index 901b77c42b8a..ced00fe8fe61 100644 --- a/include/asm-i386/types.h +++ b/include/asm-i386/types.h @@ -63,8 +63,6 @@ typedef u64 sector_t; #define HAVE_SECTOR_T #endif -typedef unsigned short kmem_bufctl_t; - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ -- cgit v1.2.3 From a600388d28419305aad3c4c0af52c223cf6fa0af Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:55:04 -0700 Subject: [PATCH] x86: ptep_clear optimization Add a new accessor for PTEs, which passes the full hint from the mmu_gather struct; this allows architectures with hardware pagetables to optimize away atomic PTE operations when destroying an address space. Removing the locked operation should allow better pipelining of memory access in this loop. I measured an average savings of 30-35 cycles per zap_pte_range on the first 500 destructions on Pentium-M, but I believe the optimization would win more on older processors which still assert the bus lock on xchg for an exclusive cacheline. Update: I made some new measurements, and this saves exactly 26 cycles over ptep_get_and_clear on Pentium M. On P4, with a PAE kernel, this saves 180 cycles per ptep_get_and_clear, for a whopping 92160 cycles savings for a full address space destruction. pte_clear_full is not yet used, but is provided for future optimizations (in particular, when running inside of a hypervisor that queues page table updates, the full hint allows us to avoid queueing unnecessary page table updates for an address space in the process of being destroyed). This is not a huge win, but it does help a bit, and sets the stage for further hypervisor optimization of the mm layer on all architectures.
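(The intended caller is the zap loop; roughly as follows, illustrative, with tlb->fullmm set by tlb_gather_mmu() when the whole mm is being torn down:)

	/* When tearing down an entire address space, no other user can
	 * observe the page tables any more, so a plain non-atomic read
	 * and clear is safe and avoids the locked xchg. */
	ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);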
Signed-off-by: Zachary Amsden Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/pgtable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index f51fd2c956bb..d74185aee15b 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -260,6 +260,18 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) +{ + pte_t pte; + if (full) { + pte = *ptep; + *ptep = __pte(0); + } else { + pte = ptep_get_and_clear(mm, addr, ptep); + } + return pte; +} + static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { clear_bit(_PAGE_BIT_RW, &ptep->pte_low); @@ -417,6 +429,7 @@ extern void noexec_setup(const char *str); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY #define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define __HAVE_ARCH_PTE_SAME #include -- cgit v1.2.3 From 7ae65fd334232468a9d6b523a4fc141cd6ec5ea4 Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Sat, 3 Sep 2005 15:56:27 -0700 Subject: [PATCH] x86: fix EFI memory map parsing The memory descriptors that comprise the EFI memory map are not fixed in stone, so their size could change in the future. This uses the memory descriptor size obtained from EFI to iterate over the memory map entries during boot. This enables the removal of an x86 specific pad (and ifdef) in the EFI header. I also couldn't stomach the broken up nature of the function to put EFI runtime calls into virtual mode any longer so I fixed that up a bit as well. For reference, this patch only impacts x86. Signed-off-by: Matt Tolentino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h index 7a32184d54bf..826a8ca50ac8 100644 --- a/include/asm-i386/setup.h +++ b/include/asm-i386/setup.h @@ -44,7 +44,7 @@ extern unsigned char boot_params[PARAM_SIZE]; #define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4))) #define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8))) #define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc))) -#define EFI_MEMMAP ((efi_memory_desc_t *) *((unsigned long *)(PARAM+0x1d0))) +#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0))) #define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4))) #define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2)) #define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8)) -- cgit v1.2.3 From 911a62d42365076209e2c327e7688db296e35d62 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Sat, 3 Sep 2005 15:56:31 -0700 Subject: [PATCH] x86: automatically enable bigsmp when we have more than 8 CPUs The i386 generic subarchitecture requires explicit DMI strings or a command line option to enable bigsmp mode. The patch below removes that restriction, and uses bigsmp as soon as it finds more than 8 logical CPUs, an Intel processor and xAPIC support.
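(The probe condition amounts to something like the following sketch of generic_bigsmp_probe(); not the verbatim hunk:)

	/* Switch the generic subarch to bigsmp when more than 8 logical
	 * CPUs are found on an Intel processor whose APIC version
	 * indicates xAPIC support. */
	if (num_processors > 8 &&
	    boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
	    APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
		def_to_bigsmp = 1;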
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/apicdef.h | 1 + include/asm-i386/mach-generic/mach_apic.h | 2 ++ include/asm-i386/mpspec.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h index a96a8f48fbfc..03185cef8e0a 100644 --- a/include/asm-i386/apicdef.h +++ b/include/asm-i386/apicdef.h @@ -16,6 +16,7 @@ #define GET_APIC_VERSION(x) ((x)&0xFF) #define GET_APIC_MAXLVT(x) (((x)>>16)&0xFF) #define APIC_INTEGRATED(x) ((x)&0xF0) +#define APIC_XAPIC(x) ((x) >= 0x14) #define APIC_TASKPRI 0x80 #define APIC_TPRI_MASK 0xFF #define APIC_ARBPRI 0x90 diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h index b13767a4e934..d9dc039da94a 100644 --- a/include/asm-i386/mach-generic/mach_apic.h +++ b/include/asm-i386/mach-generic/mach_apic.h @@ -28,4 +28,6 @@ #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) +extern void generic_bigsmp_probe(void); + #endif /* __ASM_MACH_APIC_H */ diff --git a/include/asm-i386/mpspec.h b/include/asm-i386/mpspec.h index d9fafba075bc..d84a9c326c22 100644 --- a/include/asm-i386/mpspec.h +++ b/include/asm-i386/mpspec.h @@ -11,6 +11,7 @@ extern int mp_bus_id_to_local [MAX_MP_BUSSES]; extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES]; +extern unsigned int def_to_bigsmp; extern unsigned int boot_cpu_physical_apicid; extern int smp_found_config; extern void find_smp_config (void); -- cgit v1.2.3 From 56f1d5d52a21b93bc2984c920b17e0d80df5d1b2 Mon Sep 17 00:00:00 2001 From: "Natalie.Protasevich@unisys.com" Date: Sat, 3 Sep 2005 15:56:34 -0700 Subject: [PATCH] ES7000 platform update (i386) This is a subarch update for ES7000. I've modified the platform check code and removed unnecessary OEM table parsing for newer systems that don't use OEM information during boot. Parsing the table in fact is causing problems, and the platform doesn't get recognized. The patch only affects the ES7000 subarch.
Signed-off-by: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/mach-es7000/mach_mpparse.h | 30 +++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/mach-es7000/mach_mpparse.h b/include/asm-i386/mach-es7000/mach_mpparse.h index 85809e0898d7..28a84f6185a7 100644 --- a/include/asm-i386/mach-es7000/mach_mpparse.h +++ b/include/asm-i386/mach-es7000/mach_mpparse.h @@ -1,6 +1,8 @@ #ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H +#include + static inline void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, struct mpc_config_translation *translation) { @@ -12,8 +14,9 @@ static inline void mpc_oem_pci_bus(struct mpc_config_bus *m, { } -extern int parse_unisys_oem (char *oemptr, int oem_entries); -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length); +extern int parse_unisys_oem (char *oemptr); +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void setup_unisys(); static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) @@ -22,18 +25,33 @@ static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)mpc->mpc_oemptr; if (!strncmp(oem, "UNISYS", 6)) - return parse_unisys_oem((char *)oem_table, oem_table->oem_length); + return parse_unisys_oem((char *)oem_table); } return 0; } +static inline int es7000_check_dsdt() +{ + struct acpi_table_header *header = NULL; + if(!acpi_get_table_header_early(ACPI_DSDT, &header)) + acpi_table_print(header, 0); + if (!strncmp(header->oem_id, "UNISYS", 6)) + return 1; + return 0; +} + /* Hook from generic ACPI tables.c */ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { unsigned long oem_addr; - int oem_entries; - if (!find_unisys_acpi_oem_table(&oem_addr, &oem_entries)) - return parse_unisys_oem((char *)oem_addr, oem_entries); + if (!find_unisys_acpi_oem_table(&oem_addr)) { + if (es7000_check_dsdt()) + return parse_unisys_oem((char *)oem_addr); + else { + setup_unisys(); + return 1; + } + } return 0; } -- cgit v1.2.3 From 4bb0d3ec3e5b1e9e2399cdc641b3b6521ac9cdaa Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:36 -0700 Subject: [PATCH] i386: inline asm cleanup i386 Inline asm cleanup. Use cr/dr accessor functions. Also, a potential bugfix. Also, some CR accessors really should be volatile. Reads from CR0 (numeric state may change in an exception handler), writes to CR4 (flipping CR4.TSD) and reads from CR2 (page fault) prevent instruction re-ordering. I did not add memory clobber to CR3 / CR4 / CR0 updates, as it was not there to begin with, and in no case should kernel memory be clobbered, except when doing a TLB flush, which already has memory clobber. I noticed that page invalidation does not have a memory clobber. 
I can't find a bug as a result, but there is definitely a potential for a bug here: #define __flush_tlb_single(addr) \ __asm__ __volatile__("invlpg %0": :"m" (*(char *) addr)) Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/agp.h | 2 +- include/asm-i386/bugs.h | 5 ++++- include/asm-i386/processor.h | 22 +++++++++------------- include/asm-i386/system.h | 28 +++++++++++++++++++++++++--- include/asm-i386/xor.h | 26 +++++++++++++------------- 5 files changed, 52 insertions(+), 31 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/agp.h b/include/asm-i386/agp.h index b82f5f3ab887..9075083bab76 100644 --- a/include/asm-i386/agp.h +++ b/include/asm-i386/agp.h @@ -19,7 +19,7 @@ int unmap_page_from_agp(struct page *page); /* Could use CLFLUSH here if the cpu supports it. But then it would need to be called for each cacheline of the whole page so it may not be worth it. Would need a page for it. */ -#define flush_agp_cache() asm volatile("wbinvd":::"memory") +#define flush_agp_cache() wbinvd() /* Convert a physical address to an address suitable for the GART. */ #define phys_to_gart(x) (x) diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index 6789fc275da3..ea54540638d2 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -118,7 +118,10 @@ static void __init check_hlt(void) printk("disabled\n"); return; } - __asm__ __volatile__("hlt ; hlt ; hlt ; hlt"); + halt(); + halt(); + halt(); + halt(); printk("OK.\n"); } diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index d0d8b0160090..7e17d3b4f65a 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -203,9 +203,7 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } -#define load_cr3(pgdir) \ - asm volatile("movl %0,%%cr3": :"r" (__pa(pgdir))) - +#define load_cr3(pgdir) write_cr3(__pa(pgdir)) /* * Intel CPU features in CR4 @@ -232,22 +230,20 @@ extern unsigned long mmu_cr4_features; static inline void set_in_cr4 (unsigned long mask) { + unsigned cr4; mmu_cr4_features |= mask; - __asm__("movl %%cr4,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (mask) - :"ax"); + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); } static inline void clear_in_cr4 (unsigned long mask) { + unsigned cr4; mmu_cr4_features &= ~mask; - __asm__("movl %%cr4,%%eax\n\t" - "andl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (~mask) - :"ax"); + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); } /* diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index 3db717a244f0..8048a5e018cd 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -107,13 +107,33 @@ static inline unsigned long _get_base(char * addr) #define clts() __asm__ __volatile__ ("clts") #define read_cr0() ({ \ unsigned int __dummy; \ - __asm__( \ + __asm__ __volatile__( \ "movl %%cr0,%0\n\t" \ :"=r" (__dummy)); \ __dummy; \ }) #define write_cr0(x) \ - __asm__("movl %0,%%cr0": :"r" (x)); + __asm__ __volatile__("movl %0,%%cr0": :"r" (x)); + +#define read_cr2() ({ \ + unsigned int __dummy; \ + __asm__ __volatile__( \ + "movl %%cr2,%0\n\t" \ + :"=r" (__dummy)); \ + __dummy; \ +}) +#define write_cr2(x) \ + __asm__ __volatile__("movl %0,%%cr2": :"r" (x)); + +#define read_cr3() ({ \ + unsigned int __dummy; \ + __asm__ ( \ + "movl %%cr3,%0\n\t" \ + :"=r" (__dummy)); \ + __dummy; \ +}) +#define write_cr3(x) \ + __asm__ __volatile__("movl %0,%%cr3": :"r" (x)); #define read_cr4() ({ \ unsigned 
int __dummy; \ @@ -123,7 +143,7 @@ static inline unsigned long _get_base(char * addr) __dummy; \ }) #define write_cr4(x) \ - __asm__("movl %0,%%cr4": :"r" (x)); + __asm__ __volatile__("movl %0,%%cr4": :"r" (x)); #define stts() write_cr0(8 | read_cr0()) #endif /* __KERNEL__ */ @@ -447,6 +467,8 @@ struct alt_instr { #define local_irq_enable() __asm__ __volatile__("sti": : :"memory") /* used in the idle loop; sti takes one instruction cycle to complete */ #define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") +/* used when interrupts are already enabled or to shutdown the processor */ +#define halt() __asm__ __volatile__("hlt": : :"memory") #define irqs_disabled() \ ({ \ diff --git a/include/asm-i386/xor.h b/include/asm-i386/xor.h index f80e2dbe1b56..23c86cef3b25 100644 --- a/include/asm-i386/xor.h +++ b/include/asm-i386/xor.h @@ -535,14 +535,14 @@ static struct xor_block_template xor_block_p5_mmx = { #define XMMS_SAVE do { \ preempt_disable(); \ + cr0 = read_cr0(); \ + clts(); \ __asm__ __volatile__ ( \ - "movl %%cr0,%0 ;\n\t" \ - "clts ;\n\t" \ - "movups %%xmm0,(%1) ;\n\t" \ - "movups %%xmm1,0x10(%1) ;\n\t" \ - "movups %%xmm2,0x20(%1) ;\n\t" \ - "movups %%xmm3,0x30(%1) ;\n\t" \ - : "=&r" (cr0) \ + "movups %%xmm0,(%0) ;\n\t" \ + "movups %%xmm1,0x10(%0) ;\n\t" \ + "movups %%xmm2,0x20(%0) ;\n\t" \ + "movups %%xmm3,0x30(%0) ;\n\t" \ + : \ : "r" (xmm_save) \ : "memory"); \ } while(0) @@ -550,14 +550,14 @@ static struct xor_block_template xor_block_p5_mmx = { #define XMMS_RESTORE do { \ __asm__ __volatile__ ( \ "sfence ;\n\t" \ - "movups (%1),%%xmm0 ;\n\t" \ - "movups 0x10(%1),%%xmm1 ;\n\t" \ - "movups 0x20(%1),%%xmm2 ;\n\t" \ - "movups 0x30(%1),%%xmm3 ;\n\t" \ - "movl %0,%%cr0 ;\n\t" \ + "movups (%0),%%xmm0 ;\n\t" \ + "movups 0x10(%0),%%xmm1 ;\n\t" \ + "movups 0x20(%0),%%xmm2 ;\n\t" \ + "movups 0x30(%0),%%xmm3 ;\n\t" \ : \ - : "r" (cr0), "r" (xmm_save) \ + : "r" (xmm_save) \ : "memory"); \ + write_cr0(cr0); \ preempt_enable(); \ } while(0) -- cgit v1.2.3 From 245067d1674d451855692fcd4647daf9fd47f82d Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:37 -0700 Subject: [PATCH] i386: cleanup serialize msr i386 arch cleanup. Introduce the serialize macro to serialize processor state. Why the microcode update needs it I am not quite sure, since wrmsr() is already a serializing instruction, but it is a microcode update, so I will keep the semantic the same, since this could be a timing workaround. As far as I can tell, this has always been there since the original microcode update source. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/processor.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 7e17d3b4f65a..a0489b222702 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -277,6 +277,11 @@ static inline void clear_in_cr4 (unsigned long mask) outb((data), 0x23); \ } while (0) +static inline void serialize_cpu(void) +{ + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); +} + static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { -- cgit v1.2.3 From 4d37e7e3fd851428dede4d05d3e69d03795a744a Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:38 -0700 Subject: [PATCH] i386: inline assembler: cleanup and encapsulate descriptor and task register management i386 inline assembler cleanup. 
This change encapsulates descriptor and task register management. Also, it is possible to improve assembler generation in two cases; savesegment may store the value in a register instead of a memory location, which allows GCC to optimize stack variables into registers, and MOV MEM, SEG is always a 16-bit write to memory, making the casting in math-emu unnecessary. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/desc.h | 10 ++++++++++ include/asm-i386/system.h | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 11e67811a990..9a0b85a52e71 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -30,6 +30,16 @@ extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; #define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr)) +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt)) + +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr)) +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt)) + /* * This is the ldt that every process will get unless we need * something other than this. diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index 8048a5e018cd..37fd2f8c7196 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -93,13 +93,13 @@ static inline unsigned long _get_base(char * addr) ".align 4\n\t" \ ".long 1b,3b\n" \ ".previous" \ - : :"m" (value)) + : :"rm" (value)) /* * Save a segment register away */ #define savesegment(seg, value) \ - asm volatile("mov %%" #seg ",%0":"=m" (value)) + asm volatile("mov %%" #seg ",%0":"=rm" (value)) /* * Clear and set 'TS' bit respectively -- cgit v1.2.3 From 2f2984eb4afb2a4298e3186cb49cc7e88dd6d929 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:38 -0700 Subject: [PATCH] i386: generate better code around descriptor update and access functions GCC can generate better code around descriptor update and access functions when there is not an explicit "eax" register constraint. Testing: You won't boot if this is messed up, since the TSS descriptor will be corrupted. Verified the assembler and booted. 
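(A toy illustration, not from the patch: an explicit "a" constraint pins the operand to %eax, while "q" lets GCC pick any byte-addressable register, which is all that %b1/%h1 operand modifiers need; dst and val are hypothetical variables.)

	unsigned char dst;
	unsigned long val = 0x12345678;
	/* GCC may place val in any of eax/ebx/ecx/edx here: */
	__asm__("movb %b1,%0" : "=m" (dst) : "q" (val));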
Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/desc.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 9a0b85a52e71..ccb11c032839 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -27,8 +27,8 @@ struct Xgt_desc_struct { extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; -#define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) -#define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)) #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) @@ -49,14 +49,14 @@ extern void set_intr_gate(unsigned int irq, void * addr); #define _set_tssldt_desc(n,addr,limit,type) \ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ - "movw %%ax,2(%2)\n\t" \ - "rorl $16,%%eax\n\t" \ - "movb %%al,4(%2)\n\t" \ + "movw %w1,2(%2)\n\t" \ + "rorl $16,%1\n\t" \ + "movb %b1,4(%2)\n\t" \ "movb %4,5(%2)\n\t" \ "movb $0,6(%2)\n\t" \ - "movb %%ah,7(%2)\n\t" \ - "rorl $16,%%eax" \ - : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type)) + "movb %h1,7(%2)\n\t" \ + "rorl $16,%1" \ + : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type)) static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr) { -- cgit v1.2.3 From c9b02a24130e3ff14a553d966a79f46cf806b037 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:40 -0700 Subject: [PATCH] i386: use set_pte macros in a couple places where they were missing Also, setting PDPEs in PAE mode does not require atomic operations, since the PDPEs are cached by the processor, and only reloaded on an explicit or implicit reload of CR3. Since the four PDPEs must always be present in an active root, and the kernel PDPE is never updated, we are safe even from SMIs and interrupts / NMIs using task gates (which reload CR3). Actually, much of this is moot, since the user PDPEs are never updated either, and the only usage of task gates is by the doublefault handler. It appears the only place PGDs get updated in PAE mode is in init_low_mappings() / zap_low_mapping() for initial page table creation and recovery from ACPI sleep state, and these sites are safe by inspection. Getting rid of the cmpxchg8b saves code space and 720 cycles in pgd_alloc on P4. 
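(To make the PDPE-caching point concrete: a PDPE update only becomes visible on the next explicit or implicit CR3 reload; a sketch using the cr accessors introduced earlier in this series:)

	/* Force the processor to re-read the four PDPEs (and flush the
	 * TLB) after changing a pgd entry in PAE mode. */
	write_cr3(read_cr3());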
Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/pgtable-3level.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index d609f9c2c1f0..2e3f4a344a2d 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -64,7 +64,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define set_pmd(pmdptr,pmdval) \ set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval)) #define set_pud(pudptr,pudval) \ - set_64bit((unsigned long long *)(pudptr),pud_val(pudval)) + (*(pudptr) = (pudval)) /* * Pentium-II erratum A13: in PAE mode we explicitly have to flush -- cgit v1.2.3 From 4f0cb8d978ab4b6e3b40147f619f48316d9d7f63 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Sep 2005 15:56:41 -0700 Subject: [PATCH] i386: fix incorrect TSS entry for LDT Noticed by Chuck Ebbert: the .ldt entry of the TSS was set up incorrectly. It never mattered since this was a leftover from old times, so remove it. Signed-off-by: Ingo Molnar Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/processor.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index a0489b222702..992224bff549 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -475,7 +475,6 @@ struct thread_struct { .esp0 = sizeof(init_stack) + (long)&init_stack, \ .ss0 = __KERNEL_DS, \ .ss1 = __KERNEL_CS, \ - .ldt = GDT_ENTRY_LDT, \ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \ } -- cgit v1.2.3 From f2ab4461249df85b20930a7a57b54f39c5ae291a Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:42 -0700 Subject: [PATCH] x86: more asm cleanups Some more assembler cleanups I noticed along the way. Signed-off-by: Zachary Amsden Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/msr.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h index c76fce8badbb..62b76cd96957 100644 --- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -47,6 +47,21 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val) : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT));\ ret__; }) +/* rdmsr with exception handling */ +#define rdmsr_safe(msr,a,b) ({ int ret__; \ + asm volatile("2: rdmsr ; xorl %0,%0\n" \ + "1:\n\t" \ + ".section .fixup,\"ax\"\n\t" \ + "3: movl %4,%0 ; jmp 1b\n\t" \ + ".previous\n\t" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n\t" \ + " .long 2b,3b\n\t" \ + ".previous" \ + : "=r" (ret__), "=a" (*(a)), "=d" (*(b)) \ + : "c" (msr), "i" (-EFAULT));\ + ret__; }) + #define rdtsc(low,high) \ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) -- cgit v1.2.3 From 0998e4228aca046fbd747c3fed909791d52e88eb Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:43 -0700 Subject: [PATCH] x86: privilege cleanup Privilege checking cleanup. Originally, these diffs were much greater, but recent cleanups in Linux have already done much of the cleanup. I added some explanatory comments in places where the reasoning behind certain tests is rather subtle. Also, in traps.c, we can skip the user_mode check in handle_BUG(). 
The reason is, there are only two call chains - one via die_if_kernel() and one via do_page_fault(), both entering from die(). Both of these paths already ensure that a kernel mode failure has happened. Also, the original check here, if (user_mode(regs)) was insufficient anyways, since it would not rule out BUG faults from V8086 mode execution. Saving the %ss segment in show_regs() rather than assuming a fixed value also gives better information about the current kernel state in the register dump. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/ptrace.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index 05532875e39e..7e0f2945d17d 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -61,6 +61,13 @@ struct pt_regs { struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); +/* + * user_mode_vm(regs) determines whether a register set came from user mode. + * This is true if V8086 mode was enabled OR if the register set was from + * protected mode with RPL-3 CS value. This tricky test checks that with + * one comparison. Many places in the kernel can bypass this full check + * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + */ static inline int user_mode(struct pt_regs *regs) { return (regs->xcs & 3) != 0; -- cgit v1.2.3 From a5201129307f414890f9a4410e38da205f5d7359 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:44 -0700 Subject: [PATCH] x86: make IOPL explicit The pushf/popf in switch_to are ONLY used to switch IOPL. Making this explicit in C code is more clear. This pushf/popf pair was added as a bugfix for leaking IOPL to unprivileged processes when using sysenter/sysexit based system calls (sysexit does not restore flags). When requesting an IOPL change in sys_iopl(), it is just as easy to change the current flags and the flags in the stack image (in case an IRET is required), but there is no reason to force an IRET if we came in from the SYSENTER path. This change is the minimal solution for supporting a paravirtualized Linux kernel that allows user processes to run with I/O privilege. Other solutions require radical rewrites of part of the low level fault / system call handling code, or do not fully support sysenter based system calls. Unfortunately, this added one field to the thread_struct. But as a bonus, on P4, the fastest time measured for switch_to() went from 312 to 260 cycles, a win of about 17% in the fast case through this performance critical path. 
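(An illustrative sketch of the sys_iopl() side, not the verbatim change; regs and t stand for the caller's pt_regs and the current thread_struct:)

	unsigned mask = level << 12;	/* IOPL lives in EFLAGS bits 12-13 */
	/* Update the saved flags image (used if we do return with IRET)
	 * and the live EFLAGS, so the SYSENTER exit path needs no forced
	 * IRET. */
	regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | mask;
	t->iopl = mask;
	set_iopl_mask(mask);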
Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/processor.h | 16 ++++++++++++++++ include/asm-i386/system.h | 4 +--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 992224bff549..37bef8ed7bed 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -455,6 +455,7 @@ struct thread_struct { unsigned int saved_fs, saved_gs; /* IO permissions */ unsigned long *io_bitmap_ptr; + unsigned long iopl; /* max allowed port in the bitmap, in bytes: */ unsigned long io_bitmap_max; }; @@ -511,6 +512,21 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa : /* no output */ \ :"r" (value)) +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void set_iopl_mask(unsigned mask) +{ + unsigned int reg; + __asm__ __volatile__ ("pushfl;" + "popl %0;" + "andl %1, %0;" + "orl %2, %0;" + "pushl %0;" + "popfl" + : "=&r" (reg) + : "i" (~X86_EFLAGS_IOPL), "r" (mask)); +} /* Forward declaration, a strange C thing */ struct task_struct; diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index 37fd2f8c7196..acd5c26b69ba 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -14,8 +14,7 @@ extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struc #define switch_to(prev,next,last) do { \ unsigned long esi,edi; \ - asm volatile("pushfl\n\t" \ - "pushl %%ebp\n\t" \ + asm volatile("pushl %%ebp\n\t" \ "movl %%esp,%0\n\t" /* save ESP */ \ "movl %5,%%esp\n\t" /* restore ESP */ \ "movl $1f,%1\n\t" /* save EIP */ \ "pushl %6\n\t" /* restore EIP */ \ "jmp __switch_to\n" \ "1:\t" \ "popl %%ebp\n\t" \ - "popfl" \ :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ "=a" (last),"=S" (esi),"=D" (edi) \ :"m" (next->thread.esp),"m" (next->thread.eip), \ -- cgit v1.2.3 From f2f30ebca6c0c95e987cb9a1fd1495770a75432e Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:47 -0700 Subject: [PATCH] x86: introduce a write accessor for updating the current LDT Introduce a write accessor for updating the current LDT. This is required for hypervisors like Xen that do not allow LDT pages to be directly written. Testing - here's a fun little LDT test that can be trivially modified to test limits as well. /* * Copyright (c) 2005, Zachary Amsden (zach@vmware.com) * This is licensed under the GPL.
*/ #include #include #include #include #include #include #include #define __KERNEL__ #include void main(void) { struct user_desc desc; char *code; unsigned long long tsc; code = (char *)mmap(0, 8192, PROT_EXEC|PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); desc.entry_number = 0; desc.base_addr = code; desc.limit = 1; desc.seg_32bit = 1; desc.contents = MODIFY_LDT_CONTENTS_CODE; desc.read_exec_only = 0; desc.limit_in_pages = 1; desc.seg_not_present = 0; desc.useable = 1; if (modify_ldt(1, &desc, sizeof(desc)) != 0) { perror("modify_ldt"); } printf("code base is 0x%08x\n", (unsigned)code); code[0x0ffe] = 0x0f; /* rdtsc */ code[0x0fff] = 0x31; code[0x1000] = 0xcb; /* lret */ __asm__ __volatile("lcall $7,$0xffe" : "=A" (tsc)); printf("TSC is 0x%016llx\n", tsc); } Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/desc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index ccb11c032839..6df1a53c190e 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -96,6 +96,13 @@ static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) (info)->seg_not_present == 1 && \ (info)->useable == 0 ) +static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) +{ + __u32 *lp = (__u32 *)((char *)ldt + entry*8); + *lp = entry_a; + *(lp+1) = entry_b; +} + #if TLS_SIZE != 24 # error update this code. #endif -- cgit v1.2.3 From 748f2edb52712aa3d926470a888608dc500d17e8 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Sat, 3 Sep 2005 15:56:48 -0700 Subject: [PATCH] x86 NMI: better support for debuggers This patch adds a notify to the die_nmi notify that the system is about to be taken down. If the notify is handled with a NOTIFY_STOP return, the system is given a new lease on life. We also change the nmi watchdog to carry on if die_nmi returns. This give debug code a chance to a) catch watchdog timeouts and b) possibly allow the system to continue, realizing that the time out may be due to debugger activities such as single stepping which is usually done with "other" cpus held. Signed-off-by: George Anzinger Cc: Keith Owens Signed-off-by: George Anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/kdebug.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h index b3f8d5f59d5d..316138e89910 100644 --- a/include/asm-i386/kdebug.h +++ b/include/asm-i386/kdebug.h @@ -41,9 +41,16 @@ enum die_val { DIE_PAGE_FAULT, }; -static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig) +static inline int notify_die(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) { - struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig }; + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; return notifier_call_chain(&i386die_chain, val, &args); } -- cgit v1.2.3 From d7271b14b2e9e5905aba0fbf5c4dc4f8980c0cb2 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:56:50 -0700 Subject: [PATCH] i386: encapsulate copying of pgd entries Add a clone operation for pgd updates. 
This helps complete the encapsulation of updates to page tables (or pages about to become page tables) into accessor functions rather than using memcpy() to duplicate them. This is both generally good for consistency and also necessary for running in a hypervisor which requires explicit updates to page table entries. The new function is: clone_pgd_range(pgd_t *dst, pgd_t *src, int count); dst - pointer to pgd range anywhere on a pgd page src - "" count - the number of pgds to copy. dst and src can be on the same page, but the range must not overlap and must not cross a page boundary. Note that I omitted using this call to copy pgd entries into the software suspend page root, since this is not technically a live paging structure, rather it is used on resume from suspend. CC'ing Pavel in case he has any feedback on this. Thanks to Chris Wright for noticing that this could be more optimal in PAE compiles by eliminating the memset. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/pgtable.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index d74185aee15b..47bc1ffa3d4c 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -277,6 +277,21 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, clear_bit(_PAGE_BIT_RW, &ptep->pte_low); } +/* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * + * dst - pointer to pgd range anwhere on a pgd page + * src - "" + * count - the number of pgds to copy. + * + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. + */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + /* * Macro to mark a page protection value as "uncacheable". On processors which do not support * it, this is a no-op. -- cgit v1.2.3 From 4ad8d38342430f8b52f7a8458dce90caf8c8ca64 Mon Sep 17 00:00:00 2001 From: Zwane Mwaikambo Date: Sat, 3 Sep 2005 15:56:51 -0700 Subject: [PATCH] i386 boottime for_each_cpu broken for_each_cpu walks through all processors in cpu_possible_map, which is defined as cpu_callout_map on i386 and isn't initialised until all processors have been booted. This breaks things which do for_each_cpu iterations early during boot. So, define cpu_possible_map as a bitmap with NR_CPUS bits populated. This was triggered by a patch I'm working on which does alloc_percpu before bringing up secondary processors. From: Alexander Nyberg i386-boottime-for_each_cpu-broken.patch i386-boottime-for_each_cpu-broken-fix.patch The SMP version of __alloc_percpu checks the cpu_possible_map before allocating memory for a certain cpu. With the above patches the BSP cpuid is never set in cpu_possible_map which breaks CONFIG_SMP on uniprocessor machines (as soon as someone tries to dereference something allocated via __alloc_percpu, which in fact is never allocated since the cpu is not set in cpu_possible_map).
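(One way to get the populated bitmap; a sketch under the assumption that all NR_CPUS bits are set statically:)

	/* Every potential CPU is marked possible up front, so boot-time
	 * for_each_cpu() iterations work before callout initialisation. */
	cpumask_t cpu_possible_map = CPU_MASK_ALL;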
Signed-off-by: Zwane Mwaikambo Signed-off-by: Alexander Nyberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/smp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index a283738b80b3..13250199976d 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -59,7 +59,7 @@ extern void cpu_uninit(void); extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callin_map; -#define cpu_possible_map cpu_callout_map +extern cpumask_t cpu_possible_map; /* We don't mark CPUs online until __cpu_up(), so we need another measure */ static inline int num_booting_cpus(void) -- cgit v1.2.3 From c3c433e4f33afe255389ba3b1a003dc8deb3de9a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sat, 3 Sep 2005 15:57:07 -0700 Subject: [PATCH] add suspend/resume for timer The timers lack .suspend/.resume methods. Because of this, jiffies gets a large compensation after an S3 resume, and the softlockup watchdog then reports an oops. This occurred with HPET enabled, but it's also possible with other timers. Signed-off-by: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/timer.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/asm-i386') diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index dcf1e07db08a..aed16437479d 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -1,6 +1,7 @@ #ifndef _ASMi386_TIMER_H #define _ASMi386_TIMER_H #include +#include /** * struct timer_ops - used to define a timer source @@ -23,6 +24,8 @@ struct timer_opts { unsigned long long (*monotonic_clock)(void); void (*delay)(unsigned long); unsigned long (*read_timer)(void); + int (*suspend)(pm_message_t state); + int (*resume)(void); }; struct init_timer_opts { -- cgit v1.2.3 From ed75e8d58010fdc06e2c3a81bfbebae92314c7e3 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Sat, 3 Sep 2005 15:57:18 -0700 Subject: [PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML and general usage Jeff Dike , Paolo 'Blaisorblade' Giarrusso , Bodo Stroesser Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL except that the kernel does not execute the requested syscall; this is useful to improve performance for virtual environments, like UML, which want to run the syscall on their own. In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry and on exit, and each time you also have two context switches; with SYSEMU you avoid the 2nd stop and so save two context switches per syscall. Also, some architectures don't have support in the host for changing the syscall number via ptrace(), which is currently needed to skip syscall execution (UML turns any syscall into getpid() to avoid it being executed on the host). Fixing that is hard, while SYSEMU is easier to implement. * This version of the patch includes some suggestions of Jeff Dike to avoid adding any instructions to the syscall fast path, plus some other little changes, by myself, to make it work even when the syscall is executed with SYSENTER (but I'm unsure about them). It has been widely tested for quite a long time. * Various fixes were included to handle the various switches between various states, i.e. when for instance a syscall entry is traced with one of PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit. Basically, this is done by remembering which one of them was used even after the call to ptrace_notify().
* We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP to make do_syscall_trace() notice that the current syscall was started with SYSEMU on entry, so that no notification ought to be done in the exit path; this is a bit of a hack, so this problem is solved in another way in next patches. * Also, the effects of the patch: "Ptrace - i386: fix Syscall Audit interaction with singlestep" are cancelled; they are restored back in the last patch of this series. Detailed descriptions of the patches doing this kind of processing follow (but I've already summed everything up). * Fix behaviour when changing interception kind #1. In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag only after doing the debugger notification; but the debugger might have changed the status of this flag because he continued execution with PTRACE_SYSCALL, so this is wrong. This patch fixes it by saving the flag status before calling ptrace_notify(). * Fix behaviour when changing interception kind #2: avoid intercepting syscall on return when using SYSCALL again. A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL crashes. The problem is in arch/i386/kernel/entry.S. The current SYSEMU patch inhibits the syscall-handler to be called, but does not prevent do_syscall_trace() to be called after this for syscall completion interception. The appended patch fixes this. It reuses the flag TIF_SYSCALL_EMU to remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since the flag is unused in the depicted situation. * Fix behaviour when changing interception kind #3: avoid intercepting syscall on return when using SINGLESTEP. When testing 2.6.9 and the skas3.v6 patch, with my latest patch and had problems with singlestepping on UML in SKAS with SYSEMU. It looped receiving SIGTRAPs without moving forward. EIP of the traced process was the same for all SIGTRAPs. What's missing is to handle switching from PTRACE_SYSCALL_EMU to PTRACE_SINGLESTEP in a way very similar to what is done for the change from PTRACE_SYSCALL_EMU to PTRACE_SYSCALL_TRACE. I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is notified and then wake ups the process; the syscall is executed (or skipped, when do_syscall_trace() returns 0, i.e. when using PTRACE_SYSEMU), and do_syscall_trace() is called again. Since we are on the return path of a SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL), we must still avoid notifying the parent of the syscall exit. Now, this behaviour is extended even to resuming with PTRACE_SINGLESTEP. 
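(From the tracer's side, the fast path looks roughly like this; an illustrative user-space sketch with pid/status/regs declared elsewhere:)

	/* One stop per syscall instead of two: the child stops on entry,
	 * the tracer emulates the call itself, and no exit stop happens. */
	ptrace(PTRACE_SYSEMU, pid, 0, 0);
	waitpid(pid, &status, 0);
	ptrace(PTRACE_GETREGS, pid, 0, &regs);	/* fetch syscall nr + args */
	/* ... emulate the syscall, poke the result into EAX ... */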
Signed-off-by: Paolo 'Blaisorblade' Giarrusso Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/thread_info.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/asm-i386') diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index 95add81237ea..e2cb9fa6f563 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -139,6 +139,7 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__; #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ +#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -150,13 +151,15 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__; #define _TIF_NEED_RESCHED (1< Date: Tue, 6 Sep 2005 15:16:25 -0700 Subject: [PATCH] FUTEX_WAKE_OP: pthread_cond_signal() speedup ATM pthread_cond_signal is unnecessarily slow, because it wakes one waiter (which at least on UP usually means an immediate context switch to one of the waiter threads). This waiter wakes up and after a few instructions it attempts to acquire the cv internal lock, but that lock is still held by the thread calling pthread_cond_signal. So it goes to sleep and eventually the signalling thread is scheduled in, unlocks the internal lock and wakes the waiter again. Now, before 2003-09-21 NPTL was using FUTEX_REQUEUE in pthread_cond_signal to avoid this performance issue, but it was removed when locks were redesigned to the 3 state scheme (unlocked, locked uncontended, locked contended). Following scenario shows why simply using FUTEX_REQUEUE in pthread_cond_signal together with using lll_mutex_unlock_force in place of lll_mutex_unlock is not enough and probably why it has been disabled at that time: The number is value in cv->__data.__lock. thr1 thr2 thr3 0 pthread_cond_wait 1 lll_mutex_lock (cv->__data.__lock) 0 lll_mutex_unlock (cv->__data.__lock) 0 lll_futex_wait (&cv->__data.__futex, futexval) 0 pthread_cond_signal 1 lll_mutex_lock (cv->__data.__lock) 1 pthread_cond_signal 2 lll_mutex_lock (cv->__data.__lock) 2 lll_futex_wait (&cv->__data.__lock, 2) 2 lll_futex_requeue (&cv->__data.__futex, 0, 1, &cv->__data.__lock) # FUTEX_REQUEUE, not FUTEX_CMP_REQUEUE 2 lll_mutex_unlock_force (cv->__data.__lock) 0 cv->__data.__lock = 0 0 lll_futex_wake (&cv->__data.__lock, 1) 1 lll_mutex_lock (cv->__data.__lock) 0 lll_mutex_unlock (cv->__data.__lock) # Here, lll_mutex_unlock doesn't know there are threads waiting # on the internal cv's lock Now, I believe it is possible to use FUTEX_REQUEUE in pthread_cond_signal, but it will cost us not one, but 2 extra syscalls and, what's worse, one of these extra syscalls will be done for every single waiting loop in pthread_cond_*wait. We would need to use lll_mutex_unlock_force in pthread_cond_signal after requeue and lll_mutex_cond_lock in pthread_cond_*wait after lll_futex_wait. Another alternative is to do the unlocking pthread_cond_signal needs to do (the lock can't be unlocked before lll_futex_wake, as that is racy) in the kernel. I have implemented both variants, futex-requeue-glibc.patch is the first one and futex-wake_op{,-glibc}.patch is the unlocking inside of the kernel. 
The kernel interface allows userland to specify how exactly an unlocking operation should look like (some atomic arithmetic operation with optional constant argument and comparison of the previous futex value with another constant). It has been implemented just for ppc*, x86_64 and i?86, for other architectures I'm including just a stub header which can be used as a starting point by maintainers to write support for their arches and ATM will just return -ENOSYS for FUTEX_WAKE_OP. The requeue patch has been (lightly) tested just on x86_64, the wake_op patch on ppc64 kernel running 32-bit and 64-bit NPTL and x86_64 kernel running 32-bit and 64-bit NPTL. With the following benchmark on UP x86-64 I get: for i in nptl-orig nptl-requeue nptl-wake_op; do echo time elf/ld.so --library-path .:$i /tmp/bench; \ for j in 1 2; do echo ( time elf/ld.so --library-path .:$i /tmp/bench ) 2>&1; done; done time elf/ld.so --library-path .:nptl-orig /tmp/bench real 0m0.655s user 0m0.253s sys 0m0.403s real 0m0.657s user 0m0.269s sys 0m0.388s time elf/ld.so --library-path .:nptl-requeue /tmp/bench real 0m0.496s user 0m0.225s sys 0m0.271s real 0m0.531s user 0m0.242s sys 0m0.288s time elf/ld.so --library-path .:nptl-wake_op /tmp/bench real 0m0.380s user 0m0.176s sys 0m0.204s real 0m0.382s user 0m0.175s sys 0m0.207s The benchmark is at: http://sourceware.org/ml/libc-alpha/2005-03/txt00001.txt Older futex-requeue-glibc.patch version is at: http://sourceware.org/ml/libc-alpha/2005-03/txt00002.txt Older futex-wake_op-glibc.patch version is at: http://sourceware.org/ml/libc-alpha/2005-03/txt00003.txt Will post a new version (just x86-64 fixes so that the patch applies against pthread_cond_signal.S) to libc-hacker ml soon. Attached is the kernel FUTEX_WAKE_OP patch as well as a simple-minded testcase that will not test the atomicity of the operation, but at least check if the threads that should have been woken up are woken up and whether the arithmetic operation in the kernel gave the expected results. 
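(For reference, the operation encoding that futex_atomic_op_inuser() below decodes can be built like this; a sketch matching the shifts in the patch, equivalent to glibc's macro:)

	/* op: bits 28-31, cmp: bits 24-27, oparg: bits 12-23, cmparg: bits 0-11 */
	#define FUTEX_OP(op, oparg, cmp, cmparg) \
		((((op) & 0xf) << 28) | (((cmp) & 0xf) << 24) | \
		 (((oparg) & 0xfff) << 12) | ((cmparg) & 0xfff))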
Acked-by: Ingo Molnar
Cc: Ulrich Drepper
Cc: Jamie Lokier
Cc: Rusty Russell
Signed-off-by: Jakub Jelinek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/futex.h | 108 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 include/asm-i386/futex.h

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/futex.h b/include/asm-i386/futex.h
new file mode 100644
index 000000000000..44b9db806474
--- /dev/null
+++ b/include/asm-i386/futex.h
@@ -0,0 +1,108 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	" insn "\n"						\
+"2:	.section .fixup,\"ax\"\n\
+3:	mov	%3, %1\n\
+	jmp	2b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.long	1b,3b\n\
+	.previous"						\
+	: "=r" (oldval), "=r" (ret), "=m" (*uaddr)		\
+	: "i" (-EFAULT), "m" (*uaddr), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	movl	%2, %0\n\
+	movl	%0, %3\n"					\
+	insn "\n"						\
+"2:	" LOCK_PREFIX "cmpxchgl %3, %2\n\
+	jnz	1b\n\
+3:	.section .fixup,\"ax\"\n\
+4:	mov	%5, %1\n\
+	jmp	3b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.long	1b,4b,2b,4b\n\
+	.previous"						\
+	: "=&a" (oldval), "=&r" (ret), "=m" (*uaddr),		\
+	  "=&r" (tem)						\
+	: "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	if (op == FUTEX_OP_SET)
+		__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+	else {
+#ifndef CONFIG_X86_BSWAP
+		if (boot_cpu_data.x86 == 3)
+			ret = -ENOSYS;
+		else
+#endif
+		switch (op) {
+		case FUTEX_OP_ADD:
+			__futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
+					   oldval, uaddr, oparg);
+			break;
+		case FUTEX_OP_OR:
+			__futex_atomic_op2("orl %4, %3", ret, oldval, uaddr,
+					   oparg);
+			break;
+		case FUTEX_OP_ANDN:
+			__futex_atomic_op2("andl %4, %3", ret, oldval, uaddr,
+					   ~oparg);
+			break;
+		case FUTEX_OP_XOR:
+			__futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr,
+					   oparg);
+			break;
+		default:
+			ret = -ENOSYS;
+		}
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
--
cgit v1.2.3

From 36d57ac4a818cb4aa3edbdf63ad2ebc31106f925 Mon Sep 17 00:00:00 2001
From: "H. J. Lu"
Date: Tue, 6 Sep 2005 15:16:49 -0700
Subject: [PATCH] auxiliary vector cleanups

The size of the auxiliary vector is fixed at 42 in linux/sched.h, but that
isn't very obvious when looking at linux/elf.h.  This patch adds
AT_VECTOR_SIZE so that we can change it if necessary when a new vector is
added.
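For orientation, the auxiliary vector is the array of (type, value) pairs
the kernel leaves just past the environment on a process's initial stack;
AT_SYSINFO is one such type.  A short sketch of dumping that entry,
assuming an i386 (32-bit) process; the hard-coded 32 mirrors the
AT_SYSINFO value from asm-i386/auxvec.h below:

	#include <stdio.h>
	#include <elf.h>		/* Elf32_auxv_t, AT_NULL */

	extern char **environ;

	int main(void)
	{
		char **p = environ;
		Elf32_auxv_t *auxv;

		while (*p)		/* skip the environment strings... */
			p++;
		p++;			/* ...and their NULL terminator */

		for (auxv = (Elf32_auxv_t *)p; auxv->a_type != AT_NULL; auxv++)
			if (auxv->a_type == 32)	/* AT_SYSINFO, see below */
				printf("AT_SYSINFO = %#lx\n",
				       (unsigned long)auxv->a_un.a_val);
		return 0;
	}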
Because of include file ordering problems, doing this necessitated the
extraction of the AT_* symbols into a standalone header file.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/auxvec.h | 11 +++++++++++
 include/asm-i386/elf.h    |  8 +-------
 2 files changed, 12 insertions(+), 7 deletions(-)
 create mode 100644 include/asm-i386/auxvec.h

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/auxvec.h b/include/asm-i386/auxvec.h
new file mode 100644
index 000000000000..395e13016bfb
--- /dev/null
+++ b/include/asm-i386/auxvec.h
@@ -0,0 +1,11 @@
+#ifndef __ASMi386_AUXVEC_H
+#define __ASMi386_AUXVEC_H
+
+/*
+ * Architecture-neutral AT_ values in 0-17, leave some room
+ * for more of them, start the x86-specific ones at 32.
+ */
+#define AT_SYSINFO		32
+#define AT_SYSINFO_EHDR		33
+
+#endif
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index 130bdc8c68cf..fa11117d3cfa 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -9,6 +9,7 @@
 #include <asm/user.h>
 #include <asm/processor.h>
 #include <asm/system.h>		/* for savesegment */
+#include <asm/auxvec.h>
 
 #include <linux/utsname.h>
 
@@ -109,13 +110,6 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
 
 #define ELF_PLATFORM	(system_utsname.machine)
 
-/*
- * Architecture-neutral AT_ values in 0-17, leave some room
- * for more of them, start the x86-specific ones at 32.
- */
-#define AT_SYSINFO	32
-#define AT_SYSINFO_EHDR	33
-
 #ifdef __KERNEL__
 #define SET_PERSONALITY(ex, ibcs2) do { } while (0)
--
cgit v1.2.3

From c8d127418d78aaeeb1a417ef7453dc09c9118146 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 6 Sep 2005 15:17:27 -0700
Subject: [PATCH] remove asm-*/hdreg.h

unused and useless..

Signed-off-by: Christoph Hellwig
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/hdreg.h | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 include/asm-i386/hdreg.h

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/hdreg.h b/include/asm-i386/hdreg.h
deleted file mode 100644
index 5989bbc97cbf..000000000000
--- a/include/asm-i386/hdreg.h
+++ /dev/null
@@ -1 +0,0 @@
-#warning this file is obsolete, please do not use it
--
cgit v1.2.3

From 97de50c0add1e8f3b4e764c66a13c07235fee631 Mon Sep 17 00:00:00 2001
From: Jesper Juhl
Date: Tue, 6 Sep 2005 15:17:49 -0700
Subject: [PATCH] remove verify_area(): remove verify_area() from various
 uaccess.h headers

Remove the deprecated (and unused) verify_area() from various uaccess.h
headers.

Signed-off-by: Jesper Juhl
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/uaccess.h | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 886867aea947..89ab7e2bc5aa 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -83,30 +83,6 @@ extern struct movsl_mask {
  */
 #define access_ok(type,addr,size) (likely(__range_ok(addr,size) == 0))
 
-/**
- * verify_area: - Obsolete/deprecated and will go away soon,
- * use access_ok() instead.
- * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE
- * @addr: User space pointer to start of block to check
- * @size: Size of block to check
- *
- * Context: User context only.  This function may sleep.
- *
- * This function has been replaced by access_ok().
- *
- * Checks if a pointer to a block of memory in user space is valid.
- *
- * Returns zero if the memory block may be valid, -EFAULT
- * if it is definitely invalid.
- *
- * See access_ok() for more details.
- */
-static inline int __deprecated verify_area(int type, const void __user * addr, unsigned long size)
-{
-	return access_ok(type,addr,size) ? 0 : -EFAULT;
-}
-
-
 /*
  * The exception table consists of pairs of addresses: the first is the
  * address of an instruction that is allowed to fault, and the second is
--
cgit v1.2.3

From 9317259ead88fe6c05120ae1e3ace99738e2c698 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Tue, 6 Sep 2005 15:17:57 -0700
Subject: [PATCH] Create asm-generic/fcntl.h

This set of patches creates asm-generic/fcntl.h and consolidates as much
as possible from the asm-*/fcntl.h files into it.

This patch just gathers all the identical bits of the asm-*/fcntl.h files
into asm-generic/fcntl.h.

Signed-off-by: Stephen Rothwell
Signed-off-by: Yoichi Yuasa
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/fcntl.h | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/fcntl.h b/include/asm-i386/fcntl.h
index 511cde94a3ed..b51233c6a720 100644
--- a/include/asm-i386/fcntl.h
+++ b/include/asm-i386/fcntl.h
@@ -3,10 +3,6 @@
 
 /* open/fcntl - O_SYNC is only implemented on blocks devices and on files
    located on an ext2 file system */
-#define O_ACCMODE	   0003
-#define O_RDONLY	     00
-#define O_WRONLY	     01
-#define O_RDWR		     02
 #define O_CREAT		   0100	/* not fcntl */
 #define O_EXCL		   0200	/* not fcntl */
 #define O_NOCTTY	   0400	/* not fcntl */
@@ -22,11 +18,6 @@
 #define O_NOFOLLOW	0400000	/* don't follow links */
 #define O_NOATIME	01000000
 
-#define F_DUPFD		0	/* dup */
-#define F_GETFD		1	/* get close_on_exec */
-#define F_SETFD		2	/* set/clear close_on_exec */
-#define F_GETFL		3	/* get file->f_flags */
-#define F_SETFL		4	/* set file->f_flags */
 #define F_GETLK		5
 #define F_SETLK		6
 #define F_SETLKW	7
@@ -40,9 +31,6 @@
 #define F_SETLK64	13
 #define F_SETLKW64	14
 
-/* for F_[GET|SET]FL */
-#define FD_CLOEXEC	1	/* actually anything with low bit set goes */
-
 /* for posix fcntl() and lockf() */
 #define F_RDLCK		0
 #define F_WRLCK		1
@@ -55,18 +43,6 @@
 /* for leases */
 #define F_INPROGRESS	16
 
-/* operations for bsd flock(), also used by the kernel implementation */
-#define LOCK_SH		1	/* shared lock */
-#define LOCK_EX		2	/* exclusive lock */
-#define LOCK_NB		4	/* or'd with one of the above to prevent
-				   blocking */
-#define LOCK_UN		8	/* remove lock */
-
-#define LOCK_MAND	32	/* This is a mandatory flock */
-#define LOCK_READ	64	/* ... Which allows concurrent read operations */
-#define LOCK_WRITE	128	/* ... Which allows concurrent write operations */
-#define LOCK_RW		192	/* ... Which allows concurrent read & write ops */
-
 struct flock {
 	short l_type;
 	short l_whence;
@@ -83,6 +59,6 @@ struct flock64 {
 	pid_t l_pid;
 };
 
-#define F_LINUX_SPECIFIC_BASE	1024
+#include <asm-generic/fcntl.h>
 
 #endif
--
cgit v1.2.3

From e64ca97fd80a129e538ca42d0b12c379746b83db Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Tue, 6 Sep 2005 15:17:58 -0700
Subject: [PATCH] Clean up the open flags

This patch puts the most popular of each open flag into asm-generic/fcntl.h
and cleans up the arch files.
Signed-off-by: Stephen Rothwell
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/fcntl.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/fcntl.h b/include/asm-i386/fcntl.h
index b51233c6a720..4fa08ffde878 100644
--- a/include/asm-i386/fcntl.h
+++ b/include/asm-i386/fcntl.h
@@ -1,23 +1,6 @@
 #ifndef _I386_FCNTL_H
 #define _I386_FCNTL_H
 
-/* open/fcntl - O_SYNC is only implemented on blocks devices and on files
-   located on an ext2 file system */
-#define O_CREAT		   0100	/* not fcntl */
-#define O_EXCL		   0200	/* not fcntl */
-#define O_NOCTTY	   0400	/* not fcntl */
-#define O_TRUNC		  01000	/* not fcntl */
-#define O_APPEND	  02000
-#define O_NONBLOCK	  04000
-#define O_NDELAY	O_NONBLOCK
-#define O_SYNC		 010000
-#define FASYNC		 020000	/* fcntl, for BSD compatibility */
-#define O_DIRECT	 040000	/* direct disk access hint */
-#define O_LARGEFILE	0100000
-#define O_DIRECTORY	0200000	/* must be a directory */
-#define O_NOFOLLOW	0400000	/* don't follow links */
-#define O_NOATIME	01000000
-
 #define F_GETLK		5
 #define F_SETLK		6
 #define F_SETLKW	7
--
cgit v1.2.3

From 1abf62afb6e9cdc1b2618b69067a186b94281587 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Tue, 6 Sep 2005 15:17:59 -0700
Subject: [PATCH] Clean up the fcntl operations

This patch puts the most popular of each fcntl operation/flag into
asm-generic/fcntl.h and cleans up the arch files.

Signed-off-by: Stephen Rothwell
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/fcntl.h | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/fcntl.h b/include/asm-i386/fcntl.h
index 4fa08ffde878..1d06913c3bfd 100644
--- a/include/asm-i386/fcntl.h
+++ b/include/asm-i386/fcntl.h
@@ -1,31 +1,10 @@
 #ifndef _I386_FCNTL_H
 #define _I386_FCNTL_H
 
-#define F_GETLK		5
-#define F_SETLK		6
-#define F_SETLKW	7
-
-#define F_SETOWN	8	/* for sockets. */
-#define F_GETOWN	9	/* for sockets. */
-#define F_SETSIG	10	/* for sockets. */
-#define F_GETSIG	11	/* for sockets. */
-
 #define F_GETLK64	12	/* using 'struct flock64' */
 #define F_SETLK64	13
 #define F_SETLKW64	14
 
-/* for posix fcntl() and lockf() */
-#define F_RDLCK		0
-#define F_WRLCK		1
-#define F_UNLCK		2
-
-/* for old implementation of bsd flock () */
-#define F_EXLCK		4	/* or 3 */
-#define F_SHLCK		8	/* or 4 */
-
-/* for leases */
-#define F_INPROGRESS	16
-
 struct flock {
 	short l_type;
 	short l_whence;
--
cgit v1.2.3

From 5ac353f9baf7169298ebb7de86b2d697b25bca44 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Tue, 6 Sep 2005 15:18:00 -0700
Subject: [PATCH] Clean up struct flock definitions

This patch just gathers together all the struct flock definitions except
xtensa into asm-generic/fcntl.h.
Signed-off-by: Stephen Rothwell
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/fcntl.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/fcntl.h b/include/asm-i386/fcntl.h
index 1d06913c3bfd..01f17d29d288 100644
--- a/include/asm-i386/fcntl.h
+++ b/include/asm-i386/fcntl.h
@@ -5,14 +5,6 @@
 #define F_SETLK64	13
 #define F_SETLKW64	14
 
-struct flock {
-	short l_type;
-	short l_whence;
-	off_t l_start;
-	off_t l_len;
-	pid_t l_pid;
-};
-
 struct flock64 {
 	short l_type;
 	short l_whence;
--
cgit v1.2.3

From 8d286aa5eaf951bf53d4a0f64576d4b377c435ba Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Tue, 6 Sep 2005 15:18:01 -0700
Subject: [PATCH] Clean up struct flock64 definitions

This patch gathers all the struct flock64 definitions (and the operations),
puts them under !CONFIG_64BIT and cleans up the arch files.

Signed-off-by: Stephen Rothwell
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-i386/fcntl.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/asm-i386')

diff --git a/include/asm-i386/fcntl.h b/include/asm-i386/fcntl.h
index 01f17d29d288..46ab12db5739 100644
--- a/include/asm-i386/fcntl.h
+++ b/include/asm-i386/fcntl.h
@@ -1,18 +1 @@
-#ifndef _I386_FCNTL_H
-#define _I386_FCNTL_H
-
-#define F_GETLK64	12	/* using 'struct flock64' */
-#define F_SETLK64	13
-#define F_SETLKW64	14
-
-struct flock64 {
-	short l_type;
-	short l_whence;
-	loff_t l_start;
-	loff_t l_len;
-	pid_t l_pid;
-};
-
 #include <asm-generic/fcntl.h>
-
-#endif
--
cgit v1.2.3