/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */

/*
 * NOTE(review): the header name of the original #include directive was lost
 * in this copy of the file. The standard headers below provide every type
 * used here; if the project has a header that declares ipchksum() and
 * ipchksum_add(), include it here as well.
 */
#include <stddef.h>
#include <stdint.h>

/*
 * Compute the 16-bit one's-complement checksum of a buffer.
 *
 * See RFC 1071 for mathematical explanations of why we can first sum in a
 * larger register and then narrow down, why we don't need to worry about
 * endianness, etc.
 *
 * On aarch64 and x86 the bulk of the buffer is summed with add-with-carry
 * instructions (16 resp. 8 bytes per iteration); the remaining bytes, or the
 * whole buffer on other architectures, are summed one byte at a time. In the
 * byte-wise loop, bytes at even offsets contribute the low half and bytes at
 * odd offsets the high half of each 16-bit word.
 *
 * The x86 loop terminates via JECXZ, which only looks at the low 32 bits of
 * the chunk counter, so on x86-64 the fast path is only correct for buffers
 * smaller than 32 GiB.
 *
 * @param data  Start of the buffer. Not dereferenced when size is 0.
 * @param size  Number of bytes to checksum.
 * @return      Bitwise complement of the folded 16-bit sum (0xFFFF for an
 *              empty buffer).
 */
uint16_t ipchksum(const void *data, size_t size)
{
	const uint8_t *p1 = data;
	unsigned long wide_sum = 0;
	uint32_t sum = 0;
	size_t i = 0;

#if defined(__aarch64__)
	size_t size16 = size / 16;
	const uint64_t *p8 = data;
	if (size16) {
		unsigned long tmp1, tmp2;
		i = size16 * 16;
		/*
		 * __asm__ rather than asm so this also builds in strict ISO
		 * modes. The "memory" clobber is required because the buffer is
		 * read through a register pointer the compiler knows nothing
		 * about.
		 */
		__asm__ (
			"adds xzr, xzr, xzr\n\t"	/* init carry flag for addition */
			"1:\n\t"
			"ldp %[v1], %[v2], [%[p8]], #16\n\t"
			"adcs %[wsum], %[wsum], %[v1]\n\t"
			"adcs %[wsum], %[wsum], %[v2]\n\t"
			"sub %[size16], %[size16], #1\n\t"
			"cbnz %[size16], 1b\n\t"
			"adcs %[wsum], %[wsum], xzr\n\t"	/* use up last carry */
		: [v1] "=r" (tmp1),
		  [v2] "=r" (tmp2),
		  [wsum] "+r" (wide_sum),
		  [p8] "+r" (p8),
		  [size16] "+r" (size16)
		: /* no pure inputs */
		: "cc", "memory"
		);
	}
#elif defined(__i386__) || defined(__x86_64__)
	size_t size8 = size / 8;
	const uint64_t *p8 = data;
	i = size8 * 8;
	/*
	 * __asm__ rather than asm so this also builds in strict ISO modes. The
	 * "memory" clobber is required because the buffer is read through a
	 * register pointer the compiler knows nothing about.
	 */
	__asm__ (
		"clc\n\t"
		"1:\n\t"
		"jecxz 2f\n\t"	/* technically RCX on 64, but not gonna be that big */
		"adc (%[p8]), %[wsum]\n\t"
#if defined(__i386__)
		"adc 4(%[p8]), %[wsum]\n\t"
#endif	/* __i386__ */
		"lea -1(%[size8]), %[size8]\n\t"	/* Use LEA as a makeshift ADD that */
		"lea 8(%[p8]), %[p8]\n\t"		/* doesn't modify the carry flag. */
		"jmp 1b\n\t"
		"2:\n\t"
		"setc %b[size8]\n\t"	/* reuse size register to save last carry */
		"add %[size8], %[wsum]\n\t"
	: [wsum] "+r" (wide_sum),
	  [p8] "+r" (p8),
	  [size8] "+c" (size8)	/* put size in ECX so we can JECXZ */
	: /* no pure inputs */
	: "cc", "memory"
	);
#endif	/* __i386__ || __x86_64__ */

	/* Narrow the wide accumulator by adding up its 16-bit pieces. */
	while (wide_sum) {
		sum += wide_sum & 0xFFFF;
		wide_sum >>= 16;
	}

	/*
	 * Up to four 16-bit pieces were added, so sum can be as large as
	 * 0x3FFFC. One fold may itself produce a carry (e.g. 0x1FFFF becomes
	 * 0x10000), which would be silently dropped by the final truncation
	 * when no tail bytes follow. Folding twice always yields <= 0xFFFF.
	 */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);

	/* Byte-wise tail (or the whole buffer when there is no fast path). */
	for (; i < size; i++) {
		uint32_t v = p1[i];
		if (i % 2)
			v <<= 8;
		sum += v;

		/*
		 * Doing this unconditionally seems to be faster. A single fold
		 * is enough here: sum <= 0xFFFF on entry and v <= 0xFF00.
		 */
		sum = (sum & 0xFFFF) + (sum >> 16);
	}

	return (uint16_t)~sum;
}

/*
 * Combine the checksums of two data streams into the checksum of their
 * concatenation.
 *
 * @param offset  Offset at which the data covered by `second` would start if
 *                both data streams came one after the other.
 * @param first   Checksum of the first stream, as returned by ipchksum().
 * @param second  Checksum of the second stream, as returned by ipchksum().
 * @return        Checksum of the concatenated streams.
 */
uint16_t ipchksum_add(size_t offset, uint16_t first, uint16_t second)
{
	/* Undo the final complement to get back the plain 16-bit sums. */
	first = (uint16_t)~first;
	second = (uint16_t)~second;

	/*
	 * Since the checksum is calculated in 16-bit chunks, if the offset at which
	 * the data covered by the second checksum would start (if both data streams
	 * came one after the other) is odd, that means the second stream starts in
	 * the middle of a 16-bit chunk. This means the second checksum is byte
	 * swapped compared to what we need it to be, and we must swap it back.
	 */
	if (offset % 2) {
		second = (uint16_t)((second >> 8) | (second << 8));
	}

	/* Two 16-bit values sum to at most 0x1FFFE, so one fold suffices. */
	uint32_t sum = (uint32_t)first + second;
	sum = (sum & 0xFFFF) + (sum >> 16);

	return (uint16_t)~sum;
}