diff options
author | Christoph Lameter <cl@linux.com> | 2010-12-14 10:28:47 -0600 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2010-12-18 15:54:04 +0100 |
commit | 8270137a0d50507a5b40f880db636527045b8466 (patch) | |
tree | 3490a31fcbea09ab5fffb6b2f4330dc92896f413 /arch | |
parent | 7296e08abac0a22a2534a4f6e493c764f2c77583 (diff) | |
download | linux-8270137a0d50507a5b40f880db636527045b8466.tar.gz linux-8270137a0d50507a5b40f880db636527045b8466.tar.bz2 linux-8270137a0d50507a5b40f880db636527045b8466.zip |
cpuops: Use cmpxchg for xchg to avoid lock semantics
Use cmpxchg instead of xchg to realize this_cpu_xchg.
xchg with a memory operand always implies LOCK semantics and therefore
incurs the LOCK overhead; cmpxchg without an explicit lock prefix does not.
Baselines:
xchg() = 18 cycles (no segment prefix, LOCK semantics)
__this_cpu_xchg = 1 cycle
(simulated using this_cpu_read/write, i.e. two prefixes; it looks like the
CPU can use loop optimization to get rid of most of the overhead)
Cycles before:
this_cpu_xchg = 37 cycles (segment prefix and LOCK (implied by xchg))
After:
this_cpu_xchg = 11 cycles (using cmpxchg without lock semantics)
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/include/asm/percpu.h | 21 |
1 files changed, 15 insertions, 6 deletions
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index b85ade511a53..8ee45167e817 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -263,8 +263,9 @@ do { \ }) /* - * Beware: xchg on x86 has an implied lock prefix. There will be the cost of - * full lock semantics even though they are not needed. + * xchg is implemented using cmpxchg without a lock prefix. xchg is + * expensive due to the implied lock prefix. The processor cannot prefetch + * cachelines if xchg is used. */ #define percpu_xchg_op(var, nval) \ ({ \ @@ -272,25 +273,33 @@ do { \ typeof(var) pxo_new__ = (nval); \ switch (sizeof(var)) { \ case 1: \ - asm("xchgb %2, "__percpu_arg(1) \ + asm("\n1:mov "__percpu_arg(1)",%%al" \ + "\n\tcmpxchgb %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ : "=a" (pxo_ret__), "+m" (var) \ : "q" (pxo_new__) \ : "memory"); \ break; \ case 2: \ - asm("xchgw %2, "__percpu_arg(1) \ + asm("\n1:mov "__percpu_arg(1)",%%ax" \ + "\n\tcmpxchgw %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ : "=a" (pxo_ret__), "+m" (var) \ : "r" (pxo_new__) \ : "memory"); \ break; \ case 4: \ - asm("xchgl %2, "__percpu_arg(1) \ + asm("\n1:mov "__percpu_arg(1)",%%eax" \ + "\n\tcmpxchgl %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ : "=a" (pxo_ret__), "+m" (var) \ : "r" (pxo_new__) \ : "memory"); \ break; \ case 8: \ - asm("xchgq %2, "__percpu_arg(1) \ + asm("\n1:mov "__percpu_arg(1)",%%rax" \ + "\n\tcmpxchgq %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ : "=a" (pxo_ret__), "+m" (var) \ : "r" (pxo_new__) \ : "memory"); \ |