From 76551468833cd5c356b1d9ff4bc9393fcf768a59 Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev <eugeniy.paltsev@synopsys.com>
Date: Wed, 30 Jan 2019 19:32:41 +0300
Subject: ARCv2: Add explcit unaligned access support (and ability to disable
 too)

As of today we enable unaligned access unconditionally on ARCv2.
Do this under a Kconfig option to allow disable it for test, benchmarking
etc. Also while at it

  - Select HAVE_EFFICIENT_UNALIGNED_ACCESS
  - Although gcc defaults to unaligned access (since GNU 2018.03), add the
    right toggles for enabling or disabling as appropriate
  - update bootlog to prints both HW feature status (exists, enabled/disabled)
    and SW status (used / not used).
  - wire up the relaxed memcpy for unaligned access

Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
[vgupta: squashed patches, handle gcc -mno-unaligned-access quick]
---
 arch/arc/include/asm/arcregs.h        | 1 +
 arch/arc/include/asm/irqflags-arcv2.h | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/arc/include')

diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h
index a27eafdc8260..0c13317af1d6 100644
--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -82,6 +82,7 @@
 #define ECR_V_DTLB_MISS			0x05
 #define ECR_V_PROTV			0x06
 #define ECR_V_TRAP			0x09
+#define ECR_V_MISALIGN			0x0d
 #endif
 
 /* DTLB Miss and Protection Violation Cause Codes */
diff --git a/arch/arc/include/asm/irqflags-arcv2.h b/arch/arc/include/asm/irqflags-arcv2.h
index 8a4f77ea3238..e66d0339e1d8 100644
--- a/arch/arc/include/asm/irqflags-arcv2.h
+++ b/arch/arc/include/asm/irqflags-arcv2.h
@@ -44,7 +44,13 @@
 #define ARCV2_IRQ_DEF_PRIO	1
 
 /* seed value for status register */
-#define ISA_INIT_STATUS_BITS	(STATUS_IE_MASK | STATUS_AD_MASK | \
+#ifdef CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS
+#define __AD_ENB	STATUS_AD_MASK
+#else
+#define __AD_ENB	0
+#endif
+
+#define ISA_INIT_STATUS_BITS	(STATUS_IE_MASK | __AD_ENB | \
 					(ARCV2_IRQ_DEF_PRIO << 1))
 
 #ifndef __ASSEMBLY__
-- 
cgit v1.2.3


From fbe025c3eaf5f52060c2820eddb7357363db0d27 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Fri, 22 Feb 2019 10:42:44 -0800
Subject: ARC: perf: bpok condition only exists for ARCompact

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/include/asm/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arc/include')

diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h
index 6958545390f0..9cd7ee4fad39 100644
--- a/arch/arc/include/asm/perf_event.h
+++ b/arch/arc/include/asm/perf_event.h
@@ -105,10 +105,10 @@ static const char * const arc_pmu_ev_hw_map[] = {
 	[PERF_COUNT_HW_INSTRUCTIONS] = "iall",
 	/* All jump instructions that are taken */
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmptak",
-	[PERF_COUNT_ARC_BPOK]         = "bpok",	  /* NP-NT, PT-T, PNT-NT */
 #ifdef CONFIG_ISA_ARCV2
 	[PERF_COUNT_HW_BRANCH_MISSES] = "bpmp",
 #else
+	[PERF_COUNT_ARC_BPOK]         = "bpok",	  /* NP-NT, PT-T, PNT-NT */
 	[PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */
 #endif
 	[PERF_COUNT_ARC_LDC] = "imemrdc",	/* Instr: mem read cached */
-- 
cgit v1.2.3


From 00a4ae65cc600b008c80429a4fa37ccee21f139e Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Mon, 25 Feb 2019 11:24:05 -0800
Subject: ARCv2: boot log: refurbish HS core/release identification

HS core names and releases have so far been identified based solely on
IDENTIFY.ARCVER field. With the future HS releases this will not
be sufficient as same ARCVER 0x54 could be an HS38 or HS48.

So rewrite the code to use a new BCR to identify the cores properly.

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/include/asm/arcregs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arc/include')

diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h
index 0c13317af1d6..36e5d124f5ae 100644
--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -313,7 +313,7 @@ struct cpuinfo_arc {
 	struct cpuinfo_arc_bpu bpu;
 	struct bcr_identity core;
 	struct bcr_isa_arcv2 isa;
-	const char *details, *name;
+	const char *release, *name;
 	unsigned int vec_base;
 	struct cpuinfo_arc_ccm iccm, dccm;
 	struct {
-- 
cgit v1.2.3


From 85d6adcbbe6dfc557755543c6c39b497d3032cdc Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Mon, 25 Feb 2019 11:56:28 -0800
Subject: ARC: boot log: cut down on verbosity

The syscall ABI has long been fixed, so no need to call that out now.

Also, there's no need to print really fine details such as norm,
barrel-shifter etc. Those are given in a Linux enabled hardware config.
So now we print just 1 line for all optional "instruction" related
hardware features

|
| ISA Extn	: atomic ll64 unalign mpy[opt 9] div_rem

vs. 2 before

|
|ISA Extn	: atomic ll64 unalign
|		: mpy[opt 9] div_rem norm barrel-shift swap minmax swape

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/include/asm/arcregs.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'arch/arc/include')

diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h
index 36e5d124f5ae..a7d4be87b2f0 100644
--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -168,14 +168,6 @@ struct bcr_mpy {
 #endif
 };
 
-struct bcr_extn_xymem {
-#ifdef CONFIG_CPU_BIG_ENDIAN
-	unsigned int ram_org:2, num_banks:4, bank_sz:4, ver:8;
-#else
-	unsigned int ver:8, bank_sz:4, num_banks:4, ram_org:2;
-#endif
-};
-
 struct bcr_iccm_arcompact {
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	unsigned int base:16, pad:5, sz:3, ver:8;
@@ -323,7 +315,6 @@ struct cpuinfo_arc {
 			     timer0:1, timer1:1, rtc:1, gfrc:1, pad4:4;
 	} extn;
 	struct bcr_mpy extn_mpy;
-	struct bcr_extn_xymem extn_xymem;
 };
 
 extern struct cpuinfo_arc cpuinfo_arc700[];
-- 
cgit v1.2.3


From 3032f0c9008088a3effdc2622ce16c3e1bcb13a2 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Thu, 7 Mar 2019 13:29:59 -0800
Subject: ARCv2: spinlock: remove the extra smp_mb before lock, after unlock

 - ARCv2 LLSC spinlocks have smp_mb() both before and after the LLSC
   instructions, which is not required per lkmm ACQ/REL semantics.
   smp_mb() is only needed _after_ lock and _before_ unlock.
   So remove the extra barriers.
   The reason they were there was mainly historical. At the time of
   initial SMP Linux bringup on HS38 cores, I was too conservative,
   given the fluidity of both hw and sw. The last attempt to ditch the
   extra barrier showed some hackbench regression which is apparently
   not the case now (atleast for LLSC case, read on...)

 - EX based spinlocks (!CONFIG_ARC_HAS_LLSC) still needs the extra
   smp_mb(), not due to lkmm, but due to some hardware shenanigans.
   W/o that, hackbench triggers RCU stall splat so extra DMB is retained
   !LLSC based systems are not realistic Linux sstem anyways so they can
   afford to be a nit suboptimal ;-)

   | [ARCLinux]# for i in (seq 1 1 5) ; do hackbench; done
   | Running with 10 groups 400 process
   | INFO: task hackbench:158 blocked for more than 10 seconds.
   |       Not tainted 4.20.0-00005-g96b18288a88e-dirty #117
   | "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
   | hackbench       D    0   158    135 0x00000000
   |
   | Stack Trace:
   | watchdog: BUG: soft lockup - CPU#3 stuck for 59s! [hackbench:469]
   | Modules linked in:
   | Path: (null)
   | CPU: 3 PID: 469 Comm: hackbench Not tainted 4.20.0-00005-g96b18288a88e-dirty
   |
   | [ECR   ]: 0x00000000 => Check Programmer's Manual
   | [EFA   ]: 0x00000000
   | [BLINK ]: do_exit+0x4a6/0x7d0
   | [ERET  ]: _raw_write_unlock_irq+0x44/0x5c

 - And while at it, remove the extar smp_mb() from EX based
   arch_read_trylock() since the spin lock there guarantees a full
   barrier anyways

 - For LLSC case, hackbench threads improves with this patch (HAPS @ 50MHz)

   ---- before ----
   |
   | [ARCLinux]# for i in 1 2 3 4 5; do hackbench 10 thread; done
   | Running with 10 groups 400 threads
   | Time: 16.253
   | Time: 16.445
   | Time: 16.590
   | Time: 16.721
   | Time: 16.544

   ---- after ----
   |
   | [ARCLinux]# for i in 1 2 3 4 5; do hackbench 10 thread; done
   | Running with 10 groups 400 threads
   | Time: 15.638
   | Time: 15.730
   | Time: 15.870
   | Time: 15.842
   | Time: 15.729

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/include/asm/spinlock.h | 49 ++++++++++++-----------------------------
 1 file changed, 14 insertions(+), 35 deletions(-)

(limited to 'arch/arc/include')

diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h
index 2ba04a7db621..daa914da7968 100644
--- a/arch/arc/include/asm/spinlock.h
+++ b/arch/arc/include/asm/spinlock.h
@@ -21,8 +21,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	unsigned int val;
 
-	smp_mb();
-
 	__asm__ __volatile__(
 	"1:	llock	%[val], [%[slock]]	\n"
 	"	breq	%[val], %[LOCKED], 1b	\n"	/* spin while LOCKED */
@@ -34,6 +32,14 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 	  [LOCKED]	"r"	(__ARCH_SPIN_LOCK_LOCKED__)
 	: "memory", "cc");
 
+	/*
+	 * ACQUIRE barrier to ensure load/store after taking the lock
+	 * don't "bleed-up" out of the critical section (leak-in is allowed)
+	 * http://www.spinics.net/lists/kernel/msg2010409.html
+	 *
+	 * ARCv2 only has load-load, store-store and all-all barrier
+	 * thus need the full all-all barrier
+	 */
 	smp_mb();
 }
 
@@ -42,8 +48,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
 	unsigned int val, got_it = 0;
 
-	smp_mb();
-
 	__asm__ __volatile__(
 	"1:	llock	%[val], [%[slock]]	\n"
 	"	breq	%[val], %[LOCKED], 4f	\n"	/* already LOCKED, just bail */
@@ -67,9 +71,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	smp_mb();
 
-	lock->slock = __ARCH_SPIN_LOCK_UNLOCKED__;
-
-	smp_mb();
+	WRITE_ONCE(lock->slock, __ARCH_SPIN_LOCK_UNLOCKED__);
 }
 
 /*
@@ -81,8 +83,6 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 {
 	unsigned int val;
 
-	smp_mb();
-
 	/*
 	 * zero means writer holds the lock exclusively, deny Reader.
 	 * Otherwise grant lock to first/subseq reader
@@ -113,8 +113,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 {
 	unsigned int val, got_it = 0;
 
-	smp_mb();
-
 	__asm__ __volatile__(
 	"1:	llock	%[val], [%[rwlock]]	\n"
 	"	brls	%[val], %[WR_LOCKED], 4f\n"	/* <= 0: already write locked, bail */
@@ -140,8 +138,6 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
 {
 	unsigned int val;
 
-	smp_mb();
-
 	/*
 	 * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__),
 	 * deny writer. Otherwise if unlocked grant to writer
@@ -175,8 +171,6 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
 {
 	unsigned int val, got_it = 0;
 
-	smp_mb();
-
 	__asm__ __volatile__(
 	"1:	llock	%[val], [%[rwlock]]	\n"
 	"	brne	%[val], %[UNLOCKED], 4f	\n"	/* !UNLOCKED, bail */
@@ -217,17 +211,13 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
 	: [val]		"=&r"	(val)
 	: [rwlock]	"r"	(&(rw->counter))
 	: "memory", "cc");
-
-	smp_mb();
 }
 
 static inline void arch_write_unlock(arch_rwlock_t *rw)
 {
 	smp_mb();
 
-	rw->counter = __ARCH_RW_LOCK_UNLOCKED__;
-
-	smp_mb();
+	WRITE_ONCE(rw->counter, __ARCH_RW_LOCK_UNLOCKED__);
 }
 
 #else	/* !CONFIG_ARC_HAS_LLSC */
@@ -237,10 +227,9 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 	unsigned int val = __ARCH_SPIN_LOCK_LOCKED__;
 
 	/*
-	 * This smp_mb() is technically superfluous, we only need the one
-	 * after the lock for providing the ACQUIRE semantics.
-	 * However doing the "right" thing was regressing hackbench
-	 * so keeping this, pending further investigation
+	 * Per lkmm, smp_mb() is only required after _lock (and before_unlock)
+	 * for ACQ and REL semantics respectively. However EX based spinlocks
+	 * need the extra smp_mb to workaround a hardware quirk.
 	 */
 	smp_mb();
 
@@ -257,14 +246,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 #endif
 	: "memory");
 
-	/*
-	 * ACQUIRE barrier to ensure load/store after taking the lock
-	 * don't "bleed-up" out of the critical section (leak-in is allowed)
-	 * http://www.spinics.net/lists/kernel/msg2010409.html
-	 *
-	 * ARCv2 only has load-load, store-store and all-all barrier
-	 * thus need the full all-all barrier
-	 */
 	smp_mb();
 }
 
@@ -309,8 +290,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	: "memory");
 
 	/*
-	 * superfluous, but keeping for now - see pairing version in
-	 * arch_spin_lock above
+	 * see pairing version/comment in arch_spin_lock above
 	 */
 	smp_mb();
 }
@@ -344,7 +324,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 	arch_spin_unlock(&(rw->lock_mutex));
 	local_irq_restore(flags);
 
-	smp_mb();
 	return ret;
 }
 
-- 
cgit v1.2.3