Btrfs: heuristic: add byte set calculation

Calculate byte set size for data sample: - calculate how many unique bytes have been in the sample - for all bytes count > 0, check if we're still in the low count range (~25%), such data are easily compressible, otherwise furhter analysis is needed Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com> Reviewed-by: David Sterba <dsterba@suse.com> [ update comments ] Signed-off-by: David Sterba <dsterba@suse.com>
author: Timofey Titovets <nefelim4ag@gmail.com> 2017-09-28 17:33:40 +0300
committer: David Sterba <dsterba@suse.com> 2017-11-01 20:45:36 +0100
commit: a288e92cacdc4729ad8f83d018fb0f3f5ded6c37 (patch)
tree: d305a7ac88dfe4507782d0b7f92f244087c36fb5
parent: 1fe4f6fa5ae7dd1e63145e1ced7b9b38854da9f4 (diff)
download: linux-a288e92cacdc4729ad8f83d018fb0f3f5ded6c37.tar.gz
linux-a288e92cacdc4729ad8f83d018fb0f3f5ded6c37.tar.bz2
linux-a288e92cacdc4729ad8f83d018fb0f3f5ded6c37.zip
1 files changed, 45 insertions, 0 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0d445c815ca2..e949f078a81b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1222,6 +1222,45 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
 	return 1;
 }
 
+/*
+ * Count byte values in buckets.
+ * This heuristic can detect textual data (configs, xml, json, html, etc).
+ * Because in most text-like data byte set is restricted to limited number of
+ * possible characters, and that restriction in most cases makes data easy to
+ * compress.
+ *
+ * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
+ *	less - compressible
+ *	more - need additional analysis
+ */
+#define BYTE_SET_THRESHOLD		(64)
+
+static u32 byte_set_size(const struct heuristic_ws *ws)
+{
+	u32 i;
+	u32 byte_set_size = 0;
+
+	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
+		if (ws->bucket[i].count > 0)
+			byte_set_size++;
+	}
+
+	/*
+	 * Continue collecting count of byte values in buckets.  If the byte
+	 * set size is bigger then the threshold, it's pointless to continue,
+	 * the detection technique would fail for this type of data.
+	 */
+	for (; i < BUCKET_SIZE; i++) {
+		if (ws->bucket[i].count > 0) {
+			byte_set_size++;
+			if (byte_set_size > BYTE_SET_THRESHOLD)
+				return byte_set_size;
+		}
+	}
+
+	return byte_set_size;
+}
+
 static bool sample_repeated_patterns(struct heuristic_ws *ws)
 {
 	const u32 half_of_sample = ws->sample_size / 2;
@@ -1321,6 +1360,12 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
 		ws->bucket[byte].count++;
 	}
 
+	i = byte_set_size(ws);
+	if (i < BYTE_SET_THRESHOLD) {
+		ret = 2;
+		goto out;
+	}
+
 out:
 	__free_workspace(0, ws_list, true);
 	return ret;
author	Timofey Titovets <nefelim4ag@gmail.com>	2017-09-28 17:33:40 +0300
committer	David Sterba <dsterba@suse.com>	2017-11-01 20:45:36 +0100
commit	a288e92cacdc4729ad8f83d018fb0f3f5ded6c37 (patch)
tree	d305a7ac88dfe4507782d0b7f92f244087c36fb5
parent	1fe4f6fa5ae7dd1e63145e1ced7b9b38854da9f4 (diff)
download	linux-a288e92cacdc4729ad8f83d018fb0f3f5ded6c37.tar.gz linux-a288e92cacdc4729ad8f83d018fb0f3f5ded6c37.tar.bz2 linux-a288e92cacdc4729ad8f83d018fb0f3f5ded6c37.zip