diff options
author | Maciej W. Rozycki <macro@orcam.me.uk> | 2022-05-22 21:48:14 +0100 |
---|---|---|
committer | Thomas Bogendoerfer <tsbogend@alpha.franken.de> | 2022-05-23 11:29:59 +0200 |
commit | 84aa85108b1b8af0d1d310e76975f421c5975e66 (patch) | |
tree | 7183b8568c118d9998039cd1008da131e33a1bbc /arch/mips/include | |
parent | 88ca100c885f967c37f3d60d5225a639ed6bd6f1 (diff) | |
download | linux-stable-84aa85108b1b8af0d1d310e76975f421c5975e66.tar.gz linux-stable-84aa85108b1b8af0d1d310e76975f421c5975e66.tar.bz2 linux-stable-84aa85108b1b8af0d1d310e76975f421c5975e66.zip |
MIPS: Rewrite `csum_tcpudp_nofold' in plain C
Recent commit 198688edbf77 ("MIPS: Fix inline asm input/output type
mismatch in checksum.h used with Clang") introduced a code size and
performance regression with 64-bit code emitted for `csum_tcpudp_nofold'
by GCC, caused by a redundant truncation operation produced due to a
data type change made to the variable associated with the inline
assembly's output operand.
The intent previously expressed here with operands and constraints for
optimal code was to have the output operand share a register with one
inputs, both of a different integer type each. This is perfectly valid
with the MIPS psABI where a register can hold integer data of different
types and the assembly code used here makes data stored in the output
register match the data type used with the output operand, however it
has turned out impossible to express this arrangement in source code
such as to satisfy LLVM, apparently due to the compiler's internal
limitations.
There is nothing peculiar about the inline assembly `csum_tcpudp_nofold'
includes however, though it does choose assembly instructions carefully.
Rewrite this piece of assembly in plain C then, using corresponding C
language operations, making GCC produce the same assembly instructions,
possibly shuffled, in the general case and sometimes actually fewer of
them where an input is constant, because the compiler does not have to
reload it to a register (operand constraints could be adjusted for that,
but the plain C approach is cleaner anyway).
Example code size changes are as follows, for a 32-bit configuration:
text data bss total filename
5920480 1347236 126592 7394308 vmlinux-old
5920480 1347236 126592 7394308 vmlinux-now
5919728 1347236 126592 7393556 vmlinux-c
and for a 64-bit configuration:
text data bss total filename
6024112 1790828 225728 8040668 vmlinux-old
6024128 1790828 225728 8040684 vmlinux-now
6023760 1790828 225728 8040316 vmlinux-c
respectively, where "old" is with the commit referred reverted, "now" is
with no change, and "c" is with this change applied.
Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Diffstat (limited to 'arch/mips/include')
-rw-r--r-- | arch/mips/include/asm/checksum.h | 79 |
1 files changed, 38 insertions, 41 deletions
diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h index 1e6c1354f245..4044eaf989ac 100644 --- a/arch/mips/include/asm/checksum.h +++ b/arch/mips/include/asm/checksum.h @@ -128,48 +128,45 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, - __wsum sum) + __wsum isum) { - unsigned long tmp = (__force unsigned long)sum; - - __asm__( - " .set push # csum_tcpudp_nofold\n" - " .set noat \n" -#ifdef CONFIG_32BIT - " addu %0, %2 \n" - " sltu $1, %0, %2 \n" - " addu %0, $1 \n" - - " addu %0, %3 \n" - " sltu $1, %0, %3 \n" - " addu %0, $1 \n" - - " addu %0, %4 \n" - " sltu $1, %0, %4 \n" - " addu %0, $1 \n" -#endif -#ifdef CONFIG_64BIT - " daddu %0, %2 \n" - " daddu %0, %3 \n" - " daddu %0, %4 \n" - " dsll32 $1, %0, 0 \n" - " daddu %0, $1 \n" - " sltu $1, %0, $1 \n" - " dsra32 %0, %0, 0 \n" - " addu %0, $1 \n" -#endif - " .set pop" - : "=r" (tmp) - : "0" ((__force unsigned long)daddr), - "r" ((__force unsigned long)saddr), -#ifdef __MIPSEL__ - "r" ((proto + len) << 8), -#else - "r" (proto + len), -#endif - "r" ((__force unsigned long)sum)); - - return (__force __wsum)tmp; + const unsigned int sh32 = IS_ENABLED(CONFIG_64BIT) ? 32 : 0; + unsigned long sum = (__force unsigned long)daddr; + unsigned long tmp; + __u32 osum; + + tmp = (__force unsigned long)saddr; + sum += tmp; + + if (IS_ENABLED(CONFIG_32BIT)) + sum += sum < tmp; + + /* + * We know PROTO + LEN has the sign bit clear, so cast to a signed + * type to avoid an extraneous zero-extension where TMP is 64-bit. + */ + tmp = (__s32)(proto + len); + tmp <<= IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? 8 : 0; + sum += tmp; + if (IS_ENABLED(CONFIG_32BIT)) + sum += sum < tmp; + + tmp = (__force unsigned long)isum; + sum += tmp; + + if (IS_ENABLED(CONFIG_32BIT)) { + sum += sum < tmp; + osum = sum; + } else if (IS_ENABLED(CONFIG_64BIT)) { + tmp = sum << sh32; + sum += tmp; + osum = sum < tmp; + osum += sum >> sh32; + } else { + BUILD_BUG(); + } + + return (__force __wsum)osum; } #define csum_tcpudp_nofold csum_tcpudp_nofold |