cbfs: Add LZ4 in-place decompression support for pre-RAM stages

This patch ports the LZ4 decompression code that debuted in libpayload last year to coreboot for use in CBFS stages (upgrading the base algorithm to LZ4's dev branch to access the new in-place decompression checks). This is especially useful for pre-RAM stages in constrained SRAM-based systems, which previously could not be compressed due to the size requirements of the LZMA scratchpad and bounce buffer. The LZ4 algorithm offers a very lean decompressor function and in-place decompression support to achieve roughly the same boot speed gains (trading compression ratio for decompression time) with nearly no memory overhead.

For now we only activate it for the stages that had previously not been compressed at all on non-XIP (read: non-x86) boards. In the future we may also consider replacing LZMA completely for certain boards, since which algorithm wins out on boot speed depends on board-specific parameters (architecture, processor speed, SPI transfer rate, etc.).

BRANCH=None
BUG=None
TEST=Built and booted Oak, Jerry, Nyan and Falco. Measured boot time on Oak to be about 20ms faster (cutting load times for affected stages almost in half).

Change-Id: Iec256c0e6d585d1b69985461939884a54e3ab900
Signed-off-by: Julius Werner <jwerner@chromium.org>
Reviewed-on: https://review.coreboot.org/13638
Tested-by: build bot (Jenkins)
Reviewed-by: Aaron Durbin <adurbin@chromium.org>
author: Julius Werner <jwerner@chromium.org> 2015-09-29 13:51:35 -0700
committer: Julius Werner <jwerner@chromium.org> 2016-02-22 21:38:37 +0100
commit: 09f2921b5dacaf79b391652cecd606be4dd69f50 (patch)
tree: 581236de48a22240cb1b823195a79493f037fb64 /payloads
parent: 0e3d7de7410c4ff7c9465261b58524675a0329e2 (diff)
download: coreboot-09f2921b5dacaf79b391652cecd606be4dd69f50.tar.gz
coreboot-09f2921b5dacaf79b391652cecd606be4dd69f50.tar.bz2
coreboot-09f2921b5dacaf79b391652cecd606be4dd69f50.zip
6 files changed, 116 insertions, 63 deletions
diff --git a/payloads/libpayload/include/cbfs_core.h b/payloads/libpayload/include/cbfs_core.h
index 4c59f4131a73..4cbc4c0628ea 100644
--- a/payloads/libpayload/include/cbfs_core.h
+++ b/payloads/libpayload/include/cbfs_core.h
@@ -58,6 +58,7 @@
 
 #define CBFS_COMPRESS_NONE  0
 #define CBFS_COMPRESS_LZMA  1
+#define CBFS_COMPRESS_LZ4   2
 
 /** These are standard component types for well known
     components (i.e - those that coreboot needs to consume.
diff --git a/payloads/libpayload/include/lz4.h b/payloads/libpayload/include/lz4.h
index 1f2830db4649..d2120a48fcbb 100644
--- a/payloads/libpayload/include/lz4.h
+++ b/payloads/libpayload/include/lz4.h
@@ -36,7 +36,10 @@
 
 /* Decompresses an LZ4F image (multiple LZ4 blocks with frame header) from src
  * to dst, ensuring that it doesn't read more than srcn bytes and doesn't write
- * more than dstn. Buffer sizes must stay below 2GB.
+ * more than dstn. Buffer sizes must stay below 2GB. Can decompress files loaded
+ * to the end of a buffer in-place, as long as buffer is larger than the final
+ * output size. (Usually just a few bytes, but may be up to (8 + dstn/255) in
+ * worst case. Will reliably return an error if buffer was too small.)
  * Returns amount of decompressed bytes, or 0 on error.
  */
 size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn);
@@ -44,4 +47,4 @@ size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn);
 /* Same as ulz4fn() but does not perform any bounds checks. */
 size_t ulz4f(const void *src, void *dst);
 
-#endif /* __LZO_H_ */
+#endif /* __LZ4_H_ */
diff --git a/payloads/libpayload/libcbfs/cbfs.c b/payloads/libpayload/libcbfs/cbfs.c
index 49e4941181dc..a1cc7e443cd0 100644
--- a/payloads/libpayload/libcbfs/cbfs.c
+++ b/payloads/libpayload/libcbfs/cbfs.c
@@ -35,11 +35,16 @@
 #  include <lzma.h>
 #  define CBFS_CORE_WITH_LZMA
 # endif
+# if IS_ENABLED(CONFIG_LP_LZ4)
+#  include <lz4.h>
+#  define CBFS_CORE_WITH_LZ4
+# endif
 # define CBFS_MINI_BUILD
 #elif defined(__SMM__)
 # define CBFS_MINI_BUILD
 #else
 # define CBFS_CORE_WITH_LZMA
+# define CBFS_CORE_WITH_LZ4
 # include <lib.h>
 #endif
 
diff --git a/payloads/libpayload/libcbfs/cbfs_core.c b/payloads/libpayload/libcbfs/cbfs_core.c
index ddf0da5f4218..c32d262b3361 100644
--- a/payloads/libpayload/libcbfs/cbfs_core.c
+++ b/payloads/libpayload/libcbfs/cbfs_core.c
@@ -34,6 +34,9 @@
  * CBFS_CORE_WITH_LZMA (must be #define)
  *      if defined, ulzma() must exist for decompression of data streams
  *
+ * CBFS_CORE_WITH_LZ4 (must be #define)
+ *      if defined, ulz4f() must exist for decompression of data streams
+ *
  * ERROR(x...)
  *      print an error message x (in printf format)
  *
@@ -330,6 +333,10 @@ int cbfs_decompress(int algo, void *src, void *dst, int len)
 		case CBFS_COMPRESS_LZMA:
 			return ulzma(src, dst);
 #endif
+#ifdef CBFS_CORE_WITH_LZ4
+		case CBFS_COMPRESS_LZ4:
+			return ulz4f(src, dst);
+#endif
 		default:
 			ERROR("tried to decompress %d bytes with algorithm #%x,"
 			      "but that algorithm id is unsupported.\n", len,
diff --git a/payloads/libpayload/liblz4/lz4.c b/payloads/libpayload/liblz4/lz4.c.inc
index fb89090ee28e..b3be4e5b443b 100644
--- a/payloads/libpayload/liblz4/lz4.c
+++ b/payloads/libpayload/liblz4/lz4.c.inc
@@ -37,12 +37,19 @@
 *  Reading and writing into memory
 **************************************/
 
-/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */
+/* customized variant of memcpy, which can overwrite up to 7 bytes beyond dstEnd */
 static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
 {
     BYTE* d = (BYTE*)dstPtr;
     const BYTE* s = (const BYTE*)srcPtr;
-    BYTE* e = (BYTE*)dstEnd;
+    BYTE* const e = (BYTE*)dstEnd;
+
+#if 0
+    const size_t l2 = 8 - (((size_t)d) & (sizeof(void*)-1));
+    LZ4_copy8(d,s); if (d>e-9) return;
+    d+=l2; s+=l2;
+#endif /* join to align */
+
     do { LZ4_copy8(d,s); d+=8; s+=8; } while (d<e);
 }
 
@@ -52,9 +59,9 @@ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
 **************************************/
 #define MINMATCH 4
 
-#define COPYLENGTH 8
+#define WILDCOPYLENGTH 8
 #define LASTLITERALS 5
-#define MFLIMIT (COPYLENGTH+MINMATCH)
+#define MFLIMIT (WILDCOPYLENGTH+MINMATCH)
 static const int LZ4_minLength = (MFLIMIT+1);
 
 #define KB *(1 <<10)
@@ -114,11 +121,12 @@ FORCE_INLINE int LZ4_decompress_generic(
     const BYTE* const lowLimit = lowPrefix - dictSize;
 
     const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
-    const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
-    const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+    const unsigned dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
+    const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
 
     const int safeDecode = (endOnInput==endOnInputSize);
     const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
+    const int inPlaceDecode = ((ip >= op) && (ip < oend));
 
 
     /* Special cases */
@@ -133,6 +141,9 @@ FORCE_INLINE int LZ4_decompress_generic(
         unsigned token;
         size_t length;
         const BYTE* match;
+        size_t offset;
+
+        if (unlikely((inPlaceDecode) && (op + WILDCOPYLENGTH > ip))) goto _output_error;   /* output stream ran over input stream */
 
         /* get literal length */
         token = *ip++;
@@ -144,7 +155,7 @@ FORCE_INLINE int LZ4_decompress_generic(
                 s = *ip++;
                 length += s;
             }
-            while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255));
+            while ( likely(endOnInput ? ip<iend-RUN_MASK : 1) && (s==255) );
             if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error;   /* overflow detection */
             if ((safeDecode) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error;   /* overflow detection */
         }
@@ -152,7 +163,7 @@ FORCE_INLINE int LZ4_decompress_generic(
         /* copy literals */
         cpy = op+length;
         if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
-            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+            || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)))
         {
             if (partialDecoding)
             {
@@ -164,7 +175,7 @@ FORCE_INLINE int LZ4_decompress_generic(
                 if ((!endOnInput) && (cpy != oend)) goto _output_error;       /* Error : block decoding must stop exactly there */
                 if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   /* Error : input must be consumed */
             }
-            memcpy(op, ip, length);
+            memmove(op, ip, length);
             ip += length;
             op += length;
             break;     /* Necessarily EOF, due to parsing restrictions */
@@ -173,8 +184,9 @@ FORCE_INLINE int LZ4_decompress_generic(
         ip += length; op = cpy;
 
         /* get offset */
-        match = cpy - LZ4_readLE16(ip); ip+=2;
-        if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error;   /* Error : offset outside destination buffer */
+        offset = LZ4_readLE16(ip); ip+=2;
+        match = op - offset;
+        if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error;   /* Error : offset outside buffers */
 
         /* get matchlength */
         length = token & ML_MASK;
@@ -204,12 +216,12 @@ FORCE_INLINE int LZ4_decompress_generic(
             }
             else
             {
-                /* match encompass external dictionary and current segment */
+                /* match encompass external dictionary and current block */
                 size_t copySize = (size_t)(lowPrefix-match);
                 memcpy(op, dictEnd - copySize, copySize);
                 op += copySize;
                 copySize = length - copySize;
-                if (copySize > (size_t)(op-lowPrefix))   /* overlap within current segment */
+                if (copySize > (size_t)(op-lowPrefix))   /* overlap copy */
                 {
                     BYTE* const endOfMatch = op + copySize;
                     const BYTE* copyFrom = lowPrefix;
@@ -224,28 +236,30 @@ FORCE_INLINE int LZ4_decompress_generic(
             continue;
         }
 
-        /* copy repeated sequence */
+        /* copy match within block */
         cpy = op + length;
-        if (unlikely((op-match)<8))
+        if (unlikely(offset<8))
         {
-            const size_t dec64 = dec64table[op-match];
+            const int dec64 = dec64table[offset];
             op[0] = match[0];
             op[1] = match[1];
             op[2] = match[2];
             op[3] = match[3];
-            match += dec32table[op-match];
-            LZ4_copy4(op+4, match);
-            op += 8; match -= dec64;
-        } else { LZ4_copy8(op, match); op+=8; match+=8; }
+            match += dec32table[offset];
+            memcpy(op+4, match, 4);
+            match -= dec64;
+        } else { LZ4_copy8(op, match); match+=8; }
+        op += 8;
 
         if (unlikely(cpy>oend-12))
         {
-            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals */
-            if (op < oend-8)
+            BYTE* const oCopyLimit = oend-(WILDCOPYLENGTH-1);
+            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals (uncompressed) */
+            if (op < oCopyLimit)
             {
-                LZ4_wildCopy(op, match, oend-8);
-                match += (oend-8) - op;
-                op = oend-8;
+                LZ4_wildCopy(op, match, oCopyLimit);
+                match += oCopyLimit - op;
+                op = oCopyLimit;
             }
             while (op<cpy) *op++ = *match++;
         }
diff --git a/payloads/libpayload/liblz4/lz4_wrapper.c b/payloads/libpayload/liblz4/lz4_wrapper.c
index 431fb55cc0b2..6de140e403ce 100644
--- a/payloads/libpayload/liblz4/lz4_wrapper.c
+++ b/payloads/libpayload/liblz4/lz4_wrapper.c
@@ -29,7 +29,6 @@
  * SUCH DAMAGE.
  */
 
-#include <assert.h>
 #include <endian.h>
 #include <libpayload.h>
 #include <lz4.h>
@@ -38,9 +37,28 @@
  * seem to be very inefficient in practice (at least on ARM64). Since libpayload
  * knows about endinaness and allows some basic assumptions (such as unaligned
  * access support), we can easily write the ones we need ourselves. */
-static u16 LZ4_readLE16(const void *src) { return le16toh(*(u16 *)src); }
-static void LZ4_copy4(void *dst, const void *src) { *(u32 *)dst = *(u32 *)src; }
-static void LZ4_copy8(void *dst, const void *src) { *(u64 *)dst = *(u64 *)src; }
+static uint16_t LZ4_readLE16(const void *src)
+{
+	return le16toh(*(uint16_t *)src);
+}
+static void LZ4_copy8(void *dst, const void *src)
+{
+/* ARM32 needs to be a special snowflake to prevent GCC from coalescing the
+ * access into LDRD/STRD (which don't support unaligned accesses). */
+#ifdef __arm__
+	uint32_t x0, x1;
+	asm volatile (
+		"ldr %[x0], [%[src]]\n\t"
+		"ldr %[x1], [%[src], #4]\n\t"
+		"str %[x0], [%[dst]]\n\t"
+		"str %[x1], [%[dst], #4]\n\t"
+		: [x0]"=r"(x0), [x1]"=r"(x1)
+		: [src]"r"(src), [dst]"r"(dst)
+		: "memory" );
+#else
+	*(uint64_t *)dst = *(const uint64_t *)src;
+#endif
+}
 
 typedef  uint8_t BYTE;
 typedef uint16_t U16;
@@ -52,58 +70,59 @@ typedef uint64_t U64;
 #define likely(expr) __builtin_expect((expr) != 0, 1)
 #define unlikely(expr) __builtin_expect((expr) != 0, 0)
 
-/* Unaltered (except removing unrelated code) from github.com/Cyan4973/lz4. */
-#include "lz4.c"	/* #include for inlining, do not link! */
+/* Unaltered (just removed unrelated code) from github.com/Cyan4973/lz4/dev. */
+#include "lz4.c.inc"	/* #include for inlining, do not link! */
 
 #define LZ4F_MAGICNUMBER 0x184D2204
 
 struct lz4_frame_header {
-	u32 magic;
+	uint32_t magic;
 	union {
-		u8 flags;
+		uint8_t flags;
 		struct {
-			u8 reserved0		: 2;
-			u8 has_content_checksum	: 1;
-			u8 has_content_size	: 1;
-			u8 has_block_checksum	: 1;
-			u8 independent_blocks	: 1;
-			u8 version		: 2;
+			uint8_t reserved0		: 2;
+			uint8_t has_content_checksum	: 1;
+			uint8_t has_content_size	: 1;
+			uint8_t has_block_checksum	: 1;
+			uint8_t independent_blocks	: 1;
+			uint8_t version			: 2;
 		};
 	};
 	union {
-		u8 block_descriptor;
+		uint8_t block_descriptor;
 		struct {
-			u8 reserved1		: 4;
-			u8 max_block_size	: 3;
-			u8 reserved2		: 1;
+			uint8_t reserved1		: 4;
+			uint8_t max_block_size		: 3;
+			uint8_t reserved2		: 1;
 		};
 	};
-	/* + u64 content_size iff has_content_size is set */
-	/* + u8 header_checksum */
+	/* + uint64_t content_size iff has_content_size is set */
+	/* + uint8_t header_checksum */
 } __attribute__((packed));
 
 struct lz4_block_header {
 	union {
-		u32 raw;
+		uint32_t raw;
 		struct {
-			u32 size		: 31;
-			u32 not_compressed	: 1;
+			uint32_t size		: 31;
+			uint32_t not_compressed	: 1;
 		};
 	};
 	/* + size bytes of data */
-	/* + u32 block_checksum iff has_block_checksum is set */
+	/* + uint32_t block_checksum iff has_block_checksum is set */
 } __attribute__((packed));
 
 size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn)
 {
 	const void *in = src;
 	void *out = dst;
+	size_t out_size = 0;
 	int has_block_checksum;
 
 	{ /* With in-place decompression the header may become invalid later. */
 		const struct lz4_frame_header *h = in;
 
-		if (srcn < sizeof(*h) + sizeof(u64) + sizeof(u8))
+		if (srcn < sizeof(*h) + sizeof(uint64_t) + sizeof(uint8_t))
 			return 0;	/* input overrun */
 
 		/* We assume there's always only a single, standard frame. */
@@ -117,25 +136,27 @@ size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn)
 
 		in += sizeof(*h);
 		if (h->has_content_size)
-			in += sizeof(u64);
-		in += sizeof(u8);
+			in += sizeof(uint64_t);
+		in += sizeof(uint8_t);
 	}
 
 	while (1) {
-		struct lz4_block_header b = { .raw = le32toh(*(u32 *)in) };
+		struct lz4_block_header b = { .raw = le32toh(*(uint32_t *)in) };
 		in += sizeof(struct lz4_block_header);
 
-		if (in - src + b.size > srcn)
-			return 0;		/* input overrun */
+		if ((size_t)(in - src) + b.size > srcn)
+			break;			/* input overrun */
 
-		if (!b.size)
-			return out - dst;	/* decompression successful */
+		if (!b.size) {
+			out_size = out - dst;
+			break;			/* decompression successful */
+		}
 
 		if (b.not_compressed) {
-			size_t size = MIN((u32)b.size, dst + dstn - out);
+			size_t size = MIN((uint32_t)b.size, dst + dstn - out);
 			memcpy(out, in, size);
 			if (size < b.size)
-				return 0;	/* output overrun */
+				break;		/* output overrun */
 			else
 				out += size;
 		} else {
@@ -144,15 +165,17 @@ size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn)
 					dst + dstn - out, endOnInputSize,
 					full, 0, noDict, out, NULL, 0);
 			if (ret < 0)
-				return 0;	/* decompression error */
+				break;		/* decompression error */
 			else
 				out += ret;
 		}
 
 		in += b.size;
 		if (has_block_checksum)
-			in += sizeof(u32);
+			in += sizeof(uint32_t);
 	}
+
+	return out_size;
 }
 
 size_t ulz4f(const void *src, void *dst)
author	Julius Werner <jwerner@chromium.org>	2015-09-29 13:51:35 -0700
committer	Julius Werner <jwerner@chromium.org>	2016-02-22 21:38:37 +0100
commit	09f2921b5dacaf79b391652cecd606be4dd69f50 (patch)
tree	581236de48a22240cb1b823195a79493f037fb64 /payloads
parent	0e3d7de7410c4ff7c9465261b58524675a0329e2 (diff)
download	coreboot-09f2921b5dacaf79b391652cecd606be4dd69f50.tar.gz coreboot-09f2921b5dacaf79b391652cecd606be4dd69f50.tar.bz2 coreboot-09f2921b5dacaf79b391652cecd606be4dd69f50.zip