author     Nick Terrell <terrelln@meta.com>    2025-03-08 12:09:33 -0800
committer  Nick Terrell <terrelln@meta.com>    2025-03-13 13:25:58 -0700
commit     65d1f5507ed2c78c64fce40e44e5574a9419eb09
tree       4a1b819db2ea7f2a322e9bcc7582d946d0a4ea29 /lib/zstd/common/zstd_internal.h
parent     7eb172143d5508b4da468ed59ee857c6e5e01da6
zstd: Import upstream v1.5.7
In addition to keeping the kernel's copy of zstd up to date, this update
was requested by Intel to expose upstream's APIs that allow QAT to accelerate
the LZ match-finding stage of Zstd.
This patch is imported from the upstream tag v1.5.7-kernel [0], which is signed
with upstream's signing key EF8FE99528B52FFD [1]. It was imported from upstream
using this command:
export ZSTD=/path/to/repo/zstd/
export LINUX=/path/to/repo/linux/
cd "$ZSTD/contrib/linux-kernel"
git checkout v1.5.7-kernel
make import LINUX="$LINUX"
This patch has been tested on x86-64, and has been boot tested with
a zstd compressed kernel & initramfs on i386 and aarch64. I benchmarked
the patch on x86-64 with gcc-14.2.1 on an Intel i9-9900K by measuring the
performance of compressed filesystem reads and writes.
Component,  Level,  Size delta,  Compression time delta,  Decompression time delta
Btrfs    ,      1,      +0.00%,                   -6.1%,                     +1.4%
Btrfs    ,      3,      +0.00%,                   -9.8%,                     +3.0%
Btrfs    ,      5,      +0.00%,                   +1.7%,                     +1.4%
Btrfs    ,      7,      +0.00%,                   -1.9%,                     +2.7%
Btrfs    ,      9,      +0.00%,                   -3.4%,                     +3.7%
Btrfs    ,     15,      +0.00%,                   -0.3%,                     +3.6%
SquashFS ,      1,      +0.00%,                     N/A,                     +1.9%
The major changes that impact the kernel use cases for each version are:
v1.5.7: https://github.com/facebook/zstd/releases/tag/v1.5.7
* Add zstd_compress_sequences_and_literals() for use by Intel's QAT driver
to implement Zstd compression acceleration in the kernel (see the usage
sketch after this list).
* Fix an underflow bug in 32-bit builds that can cause data corruption when
processing more than 4GB of data with a single `ZSTD_CCtx` object, when an
input crosses the 4GB boundary. I don't believe this impacts any current kernel
use cases, because the `ZSTD_CCtx` is typically reconstructed between
compressions.
* Levels 1-4 see 5-10% compression speed improvements for inputs smaller than
128KB.
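To make the new entry point concrete, here is a hedged userspace sketch of
handing externally-found matches to zstd's entropy-coding stage. It uses
upstream's naming, ZSTD_compressSequencesAndLiterals() from the v1.5.7
experimental (static-only) API, which the kernel wrapper
zstd_compress_sequences_and_literals() is expected to mirror; the tiny
hand-written parse below is illustrative only, not QAT output.

#define ZSTD_STATIC_LINKING_ONLY /* the sequence-producer API is experimental */
#include <zstd.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* Pretend an external match finder parsed a 1013-byte input as:
     * 13 literals ("hello, world\n"), then a 1000-byte match at offset 13
     * (an overlapping match that repeats the 13-byte pattern). */
    char literals[13 + 8];              /* litCapacity must be >= litSize + 8 */
    const ZSTD_Sequence seqs[] = {
        { 13, 13, 1000, 0 },            /* offset, litLength, matchLength, rep */
        { 0, 0, 0, 0 },                 /* explicit block delimiter */
    };
    char dst[2048];
    memcpy(literals, "hello, world\n", 13);

    ZSTD_CCtx* cctx = ZSTD_createCCtx();
    /* This entry point only supports explicit block delimiters. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                           ZSTD_sf_explicitBlockDelimiters);

    size_t csize = ZSTD_compressSequencesAndLiterals(
        cctx, dst, sizeof(dst),
        seqs, 2,                        /* the match finder's output */
        literals, 13, sizeof(literals), /* literals + required slack */
        13 + 1000);                     /* exact decompressed size */

    if (ZSTD_isError(csize))
        printf("error: %s\n", ZSTD_getErrorName(csize));
    else
        printf("compressed %d -> %zu bytes\n", 13 + 1000, csize);
    ZSTD_freeCCtx(cctx);
    return 0;
}

In the kernel, the idea is the same: QAT produces the sequence and literal
buffers in hardware, and zstd only runs the entropy stage over them.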
v1.5.6: https://github.com/facebook/zstd/releases/tag/v1.5.6
* Improved compression ratio for the highest compression levels. I don't expect
these to see much use, however, due to their slow speeds.
v1.5.5: https://github.com/facebook/zstd/releases/tag/v1.5.5
* Fix a rare corruption bug that can trigger on levels 13 and above.
* Improve compression speed of levels 5-11 on incompressible data.
v1.5.4: https://github.com/facebook/zstd/releases/tag/v1.5.4
* Improve compression speed of levels 5-11 on ARM.
* Improve dictionary compression speed.
Signed-off-by: Nick Terrell <terrelln@fb.com>
Diffstat (limited to 'lib/zstd/common/zstd_internal.h')
-rw-r--r--  lib/zstd/common/zstd_internal.h  153
1 file changed, 14 insertions, 139 deletions
diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
index 93305d9b41bb..52a79435caf6 100644
--- a/lib/zstd/common/zstd_internal.h
+++ b/lib/zstd/common/zstd_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -28,12 +29,10 @@
 #include <linux/zstd.h>
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "huf.h"
 #include <linux/xxhash.h> /* XXH_reset, update, digest */
 
 #define ZSTD_TRACE 0
-
 /* ---- static assert (debug) --- */
 #define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
 #define ZSTD_isError ERR_isError /* for inlining */
@@ -83,16 +82,17 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
 #define ZSTD_FRAMECHECKSUMSIZE 4
 
 #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6
 
-#define HufLog 12
-typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } SymbolEncodingType_e;
 
 #define LONGNBSEQ 0x7F00
 
 #define MINMATCH 3
 
 #define Litbits 8
+#define LitHufLog 11
 #define MaxLit ((1<<Litbits) - 1)
 #define MaxML 52
 #define MaxLL 35
@@ -103,6 +103,8 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define LLFSELog 9
 #define OffFSELog 8
 #define MaxFSELog MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+#define MaxMLBits 16
+#define MaxLLBits 16
 
 #define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
 /* Each table cannot take more than #symbols * FSELog bits */
@@ -166,7 +168,7 @@ static void ZSTD_copy8(void* dst, const void* src) {
     ZSTD_memcpy(dst, src, 8);
 #endif
 }
-#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+#define COPY8(d,s) do { ZSTD_copy8(d,s); d+=8; s+=8; } while (0)
 
 /* Need to use memmove here since the literal buffer can now be located within
    the dst buffer. In circumstances where the op "catches up" to where the
@@ -186,7 +188,7 @@ static void ZSTD_copy16(void* dst, const void* src) {
     ZSTD_memcpy(dst, copy16_buf, 16);
 #endif
 }
-#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
+#define COPY16(d,s) do { ZSTD_copy16(d,s); d+=16; s+=16; } while (0)
 
 #define WILDCOPY_OVERLENGTH 32
 #define WILDCOPY_VECLEN 16
@@ -215,7 +217,7 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
     if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
         /* Handle short offset copies. */
         do {
-            COPY8(op, ip)
+            COPY8(op, ip);
         } while (op < oend);
     } else {
         assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
@@ -225,12 +227,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
          * one COPY16() in the first call. Then, do two calls per loop since
          * at that point it is more likely to have a high trip count.
          */
-#ifdef __aarch64__
-        do {
-            COPY16(op, ip);
-        }
-        while (op < oend);
-#else
         ZSTD_copy16(op, ip);
         if (16 >= length) return;
         op += 16;
@@ -240,7 +236,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
             COPY16(op, ip);
         }
         while (op < oend);
-#endif
     }
 }
 
@@ -273,62 +268,6 @@ typedef enum {
 /*-*******************************************
 *  Private declarations
 *********************************************/
-typedef struct seqDef_s {
-    U32 offBase;   /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */
-    U16 litLength;
-    U16 mlBase;    /* mlBase == matchLength - MINMATCH */
-} seqDef;
-
-/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */
-typedef enum {
-    ZSTD_llt_none = 0,             /* no longLengthType */
-    ZSTD_llt_literalLength = 1,    /* represents a long literal */
-    ZSTD_llt_matchLength = 2       /* represents a long match */
-} ZSTD_longLengthType_e;
-
-typedef struct {
-    seqDef* sequencesStart;
-    seqDef* sequences;      /* ptr to end of sequences */
-    BYTE* litStart;
-    BYTE* lit;              /* ptr to end of literals */
-    BYTE* llCode;
-    BYTE* mlCode;
-    BYTE* ofCode;
-    size_t maxNbSeq;
-    size_t maxNbLit;
-
-    /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength
-     * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
-     * the existing value of the litLength or matchLength by 0x10000.
-     */
-    ZSTD_longLengthType_e longLengthType;
-    U32                   longLengthPos;  /* Index of the sequence to apply long length modification to */
-} seqStore_t;
-
-typedef struct {
-    U32 litLength;
-    U32 matchLength;
-} ZSTD_sequenceLength;
-
-/*
- * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences
- * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength.
- */
-MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq)
-{
-    ZSTD_sequenceLength seqLen;
-    seqLen.litLength = seq->litLength;
-    seqLen.matchLength = seq->mlBase + MINMATCH;
-    if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
-        if (seqStore->longLengthType == ZSTD_llt_literalLength) {
-            seqLen.litLength += 0xFFFF;
-        }
-        if (seqStore->longLengthType == ZSTD_llt_matchLength) {
-            seqLen.matchLength += 0xFFFF;
-        }
-    }
-    return seqLen;
-}
 
 /*
  * Contains the compressed frame size and an upper-bound for the decompressed frame size.
@@ -337,74 +276,11 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
  * `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
 */
 typedef struct {
+    size_t nbBlocks;
     size_t compressedSize;
     unsigned long long decompressedBound;
 } ZSTD_frameSizeInfo;   /* decompress & legacy */
 
-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
-
-/* custom memory allocation functions */
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem);
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem);
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem);
-
-
-MEM_STATIC U32 ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* GCC Intrinsic */
-        return __builtin_clz (val) ^ 31;
-#   else   /* Software version */
-        static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-        U32 v = val;
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-        return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
-#   endif
-    }
-}
-
-/*
- * Counts the number of trailing zeros of a `size_t`.
- * Most compilers should support CTZ as a builtin. A backup
- * implementation is provided if the builtin isn't supported, but
- * it may not be terribly efficient.
- */
-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val)
-{
-    if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return __builtin_ctzll((U64)val);
-#       else
-            static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19,
-                                                     4, 25, 14, 28, 9, 34, 20, 56,
-                                                     5, 17, 26, 54, 15, 41, 29, 43,
-                                                     10, 31, 38, 35, 21, 45, 49, 57,
-                                                     63, 6, 12, 18, 24, 27, 33, 55,
-                                                     16, 53, 40, 42, 30, 37, 44, 48,
-                                                     62, 11, 23, 32, 52, 39, 36, 47,
-                                                     61, 22, 51, 46, 60, 50, 59, 58 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-    } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return __builtin_ctz((U32)val);
-#       else
-            static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3,
-                                                     30, 22, 20, 15, 25, 17, 4, 8,
-                                                     31, 27, 13, 23, 21, 19, 16, 7,
-                                                     26, 12, 18, 6, 11, 5, 10, 9 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-    }
-}
-
-
 /* ZSTD_invalidateRepCodes() :
  *  ensures next compression will not use repcodes from previous block.
  *  Note : only works with regular variant;
@@ -420,13 +296,13 @@ typedef struct {
 
 /*! ZSTD_getcBlockSize() :
  *  Provides the size of compressed block from block header `src` */
-/* Used by: decompress, fullbench (does not get its definition from here) */
+/* Used by: decompress, fullbench */
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                           blockProperties_t* bpPtr);
 
/*! ZSTD_decodeSeqHeaders() :
 *  decode sequence header from src */
-/* Used by: decompress, fullbench (does not get its definition from here) */
+/* Used by: zstd_decompress_block, fullbench */
 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                              const void* src, size_t srcSize);
@@ -439,5 +315,4 @@ MEM_STATIC int ZSTD_cpuSupportsBmi2(void)
     return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid);
 }
 
-
 #endif /* ZSTD_CCOMMON_H_MODULE */
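A small cleanup recurring in the hunks above: COPY8()/COPY16() changed from
bare-brace statement macros to do { ... } while (0), and the COPY8 call site
gained a trailing semicolon. A minimal standalone illustration of the pitfall
the idiom avoids (the EMIT_* macros are hypothetical, not from zstd):

#include <stdio.h>

/* A bare-brace statement macro breaks when used as the sole statement
 * before `else`: `{...};` is two statements, so the `;` ends the `if`
 * and orphans the `else`. */
#define EMIT_BRACES(x)   { printf("%d\n", x); }
#define EMIT_DOWHILE(x)  do { printf("%d\n", x); } while (0)

int main(void)
{
    int ok = 1;
    /* Compiles because do/while(0) absorbs the trailing semicolon
     * as part of a single statement: */
    if (ok)
        EMIT_DOWHILE(1);
    else
        EMIT_DOWHILE(0);
    /* The same shape with EMIT_BRACES(1); would be a syntax error:
     * the `else` would have no matching `if`. */
    return 0;
}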