summaryrefslogtreecommitdiffstats
path: root/arch/tile/lib/memset_32.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-09-06 11:14:33 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2013-09-06 11:14:33 -0700
commit4de9ad9bc08b4953fc03336ad38908496e2f8826 (patch)
treebd44add223061a58317034a0d6c9686d95d12fba /arch/tile/lib/memset_32.c
parent576c25eb5954035b64112188d9a2683144600f3d (diff)
parent06da6629e68ddc8ffe2933d33b3681f09104b3f1 (diff)
downloadlinux-4de9ad9bc08b4953fc03336ad38908496e2f8826.tar.gz
linux-4de9ad9bc08b4953fc03336ad38908496e2f8826.tar.bz2
linux-4de9ad9bc08b4953fc03336ad38908496e2f8826.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile
Pull Tile arch updates from Chris Metcalf: "These changes bring in a bunch of new functionality that has been maintained internally at Tilera over the last year, plus other stray bits of work that I've taken into the tile tree from other folks. The changes include some PCI root complex work, interrupt-driven console support, support for performing fast-path unaligned data fixups by kernel-based JIT code generation, CONFIG_PREEMPT support, vDSO support for gettimeofday(), a serial driver for the tilegx on-chip UART, KGDB support, more optimized string routines, support for ftrace and kprobes, improved ASLR, and many bug fixes. We also remove support for the old TILE64 chip, which is no longer buildable" * git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile: (85 commits) tile: refresh tile defconfig files tile: rework <asm/cmpxchg.h> tile PCI RC: make default consistent DMA mask 32-bit tile: add null check for kzalloc in tile/kernel/setup.c tile: make __write_once a synonym for __read_mostly tile: remove support for TILE64 tile: use asm-generic/bitops/builtin-*.h tile: eliminate no-op "noatomichash" boot argument tile: use standard tile_bundle_bits type in traps.c tile: simplify code referencing hypervisor API addresses tile: change <asm/system.h> to <asm/switch_to.h> in comments tile: mark pcibios_init() as __init tile: check for correct compiler earlier in asm-offsets.c tile: use standard 'generic-y' model for <asm/hw_irq.h> tile: use asm-generic version of <asm/local64.h> tile PCI RC: add comment about "PCI hole" problem tile: remove DEBUG_EXTRA_FLAGS kernel config option tile: add virt_to_kpte() API and clean up and document behavior tile: support FRAME_POINTER tile: support reporting Tilera hypervisor statistics ...
Diffstat (limited to 'arch/tile/lib/memset_32.c')
-rw-r--r--arch/tile/lib/memset_32.c110
1 files changed, 1 insertions, 109 deletions
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff8..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
* more details.
*/
-#include <arch/chip.h>
-
#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
void *memset(void *s, int c, size_t n)
{
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
int n32;
uint32_t v16, v32;
uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
- int ahead32;
-#else
int to_align32;
-#endif
/* Experimentation shows that a trivial tight loop is a win up until
* around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
return s;
}
-#if !CHIP_HAS_WH64()
- /* Use a spare issue slot to start prefetching the first cache
- * line early. This instruction is free as the store can be buried
- * in otherwise idle issue slots doing ALU ops.
- */
- __insn_prefetch(out8);
-
- /* We prefetch the end so that a short memset that spans two cache
- * lines gets some prefetching benefit. Again we believe this is free
- * to issue.
- */
- __insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
-
-
/* Align 'out8'. We know n >= 3 so this won't write past the end. */
while (((uintptr_t) out8 & 3) != 0) {
*out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
/* This must be at least 8 or the following loop doesn't work. */
#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
-#if !CHIP_HAS_WH64()
-
- ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-
- /* We already prefetched the first and last cache lines, so
- * we only need to do more prefetching if we are storing
- * to more than two cache lines.
- */
- if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
- int i;
-
- /* Prefetch the next several cache lines.
- * This is the setup code for the software-pipelined
- * loop below.
- */
-#define MAX_PREFETCH 5
- ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
- if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
- ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-
- for (i = CACHE_LINE_SIZE_IN_WORDS;
- i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
- __insn_prefetch(&out32[i]);
- }
-
- if (n32 > ahead32) {
- while (1) {
- int j;
-
- /* Prefetch by reading one word several cache lines
- * ahead. Since loads are non-blocking this will
- * cause the full cache line to be read while we are
- * finishing earlier cache lines. Using a store
- * here causes microarchitectural performance
- * problems where a victimizing store miss goes to
- * the head of the retry FIFO and locks the pipe for
- * a few cycles. So a few subsequent stores in this
- * loop go into the retry FIFO, and then later
- * stores see other stores to the same cache line
- * are already in the retry FIFO and themselves go
- * into the retry FIFO, filling it up and grinding
- * to a halt waiting for the original miss to be
- * satisfied.
- */
- __insn_prefetch(&out32[ahead32]);
-
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-
- n32 -= CACHE_LINE_SIZE_IN_WORDS;
-
- /* Save icache space by only partially unrolling
- * this loop.
- */
- for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
- *out32++ = v32;
- *out32++ = v32;
- *out32++ = v32;
- *out32++ = v32;
- }
-
- /* To save compiled code size, reuse this loop even
- * when we run out of prefetching to do by dropping
- * ahead32 down.
- */
- if (n32 <= ahead32) {
- /* Not even a full cache line left,
- * so stop now.
- */
- if (n32 < CACHE_LINE_SIZE_IN_WORDS)
- break;
-
- /* Choose a small enough value that we don't
- * prefetch past the end. There's no sense
- * in touching cache lines we don't have to.
- */
- ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
- }
- }
- }
-
-#else /* CHIP_HAS_WH64() */
-
/* Determine how many words we need to emit before the 'out32'
* pointer becomes aligned modulo the cache line size.
*/
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
}
-#endif /* CHIP_HAS_WH64() */
-
/* Now handle any leftover values. */
if (n32 != 0) {
do {