summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/compression.c454
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/ctree.h99
-rw-r--r--fs/btrfs/disk-io.c18
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c27
-rw-r--r--fs/btrfs/extent_io.c411
-rw-r--r--fs/btrfs/extent_io.h17
-rw-r--r--fs/btrfs/extent_map.c9
-rw-r--r--fs/btrfs/extent_map.h6
-rw-r--r--fs/btrfs/file-item.c75
-rw-r--r--fs/btrfs/file.c263
-rw-r--r--fs/btrfs/inode.c584
-rw-r--r--fs/btrfs/ordered-data.c9
-rw-r--r--fs/btrfs/ordered-data.h10
-rw-r--r--fs/btrfs/print-tree.c7
-rw-r--r--fs/btrfs/super.c10
-rw-r--r--fs/btrfs/tree-log.c3
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/zlib.c637
22 files changed, 2315 insertions, 379 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 18f5a85b47c6..31cce5d88b1a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -501,6 +501,8 @@ config BTRFS_FS
tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
depends on EXPERIMENTAL
select LIBCRC32C
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
help
Btrfs is a new filesystem with extents, writable snapshotting,
support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142b..d2cf5a54a4b8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
- ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+ compression.o
else
# Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..c5470367ca5c
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+ /* number of bios pending for this compressed extent */
+ atomic_t pending_bios;
+
+ /* the pages with the compressed data on them */
+ struct page **compressed_pages;
+
+ /* inode that owns this data */
+ struct inode *inode;
+
+ /* starting offset in the inode for our pages */
+ u64 start;
+
+ /* number of bytes in the inode we're working on */
+ unsigned long len;
+
+ /* number of bytes on disk */
+ unsigned long compressed_len;
+
+ /* number of compressed pages in the array */
+ unsigned long nr_pages;
+
+ /* IO errors */
+ int errors;
+
+ /* for reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+};
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+ u64 first_byte, gfp_t gfp_flags)
+{
+ struct bio *bio;
+ int nr_vecs;
+
+ nr_vecs = bio_get_nr_vecs(bdev);
+ bio = bio_alloc(gfp_flags, nr_vecs);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+
+ if (bio) {
+ bio->bi_size = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_sector = first_byte >> 9;
+ }
+ return bio;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+ int ret;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ /* ok, we're the last bio for this extent, lets start
+ * the decompression.
+ */
+ inode = cb->inode;
+ tree = &BTRFS_I(inode)->io_tree;
+ ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+ cb->start,
+ cb->orig_bio->bi_io_vec,
+ cb->orig_bio->bi_vcnt,
+ cb->compressed_len);
+ if (ret)
+ cb->errors = 1;
+
+ /* release the compressed pages */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* do io completion on the original bio */
+ if (cb->errors)
+ bio_io_error(cb->orig_bio);
+ else
+ bio_endio(cb->orig_bio, 0);
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+ struct page *pages[16];
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int ret;
+
+ while(nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min(nr_pages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ nr_pages -= 1;
+ index += 1;
+ continue;
+ }
+ for (i = 0; i < ret; i++) {
+ end_page_writeback(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ }
+ /* the inode may be gone now */
+ return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ /* ok, we're the last bio for this extent, step one is to
+ * call back into the FS and do all the end_io operations
+ */
+ inode = cb->inode;
+ tree = &BTRFS_I(inode)->io_tree;
+ tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+ cb->start,
+ cb->start + cb->len - 1,
+ NULL, 1);
+
+ end_compressed_writeback(inode, cb->start, cb->len);
+ /* note, our inode could be gone now */
+
+ /*
+ * release the compressed pages, these came from alloc_page and
+ * are not attached to the inode at all
+ */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages)
+{
+ struct bio *bio = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct compressed_bio *cb;
+ unsigned long bytes_left;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int page_index = 0;
+ struct page *page;
+ u64 first_byte = disk_start;
+ struct block_device *bdev;
+ int ret;
+
+ WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+ cb = kmalloc(sizeof(*cb), GFP_NOFS);
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+ cb->start = start;
+ cb->len = len;
+ cb->compressed_pages = compressed_pages;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = NULL;
+ cb->nr_pages = nr_pages;
+
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ ret = btrfs_csum_file_bytes(root, inode, start, len);
+ BUG_ON(ret);
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ atomic_inc(&cb->pending_bios);
+
+ /* create and submit bios for the compressed pages */
+ bytes_left = compressed_len;
+ while(bytes_left > 0) {
+ page = compressed_pages[page_index];
+ page->mapping = inode->i_mapping;
+ if (bio->bi_size)
+ ret = io_tree->ops->merge_bio_hook(page, 0,
+ PAGE_CACHE_SIZE,
+ bio, 0);
+ else
+ ret = 0;
+
+ if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret);
+
+ bio_put(bio);
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ atomic_inc(&cb->pending_bios);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ page_index++;
+ bytes_left -= PAGE_CACHE_SIZE;
+ first_byte += PAGE_CACHE_SIZE;
+ }
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret);
+
+ bio_put(bio);
+ return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it. We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *em_tree;
+ struct compressed_bio *cb;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ unsigned long compressed_len;
+ unsigned long nr_pages;
+ unsigned long page_index;
+ struct page *page;
+ struct block_device *bdev;
+ struct bio *comp_bio;
+ u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+ struct extent_map *em;
+ int ret;
+
+ tree = &BTRFS_I(inode)->io_tree;
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ /* we need the actual starting offset of this extent in the file */
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree,
+ page_offset(bio->bi_io_vec->bv_page),
+ PAGE_CACHE_SIZE);
+ spin_unlock(&em_tree->lock);
+
+ cb = kmalloc(sizeof(*cb), GFP_NOFS);
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+
+ cb->start = em->start;
+ compressed_len = em->block_len;
+ free_extent_map(em);
+
+ cb->len = uncompressed_len;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = bio;
+
+ nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE;
+ cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+ GFP_NOFS);
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ for (page_index = 0; page_index < nr_pages; page_index++) {
+ cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+ __GFP_HIGHMEM);
+ }
+ cb->nr_pages = nr_pages;
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+ comp_bio->bi_private = cb;
+ comp_bio->bi_end_io = end_compressed_bio_read;
+ atomic_inc(&cb->pending_bios);
+
+ for (page_index = 0; page_index < nr_pages; page_index++) {
+ page = cb->compressed_pages[page_index];
+ page->mapping = inode->i_mapping;
+ if (comp_bio->bi_size)
+ ret = tree->ops->merge_bio_hook(page, 0,
+ PAGE_CACHE_SIZE,
+ comp_bio, 0);
+ else
+ ret = 0;
+
+ if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+ BUG_ON(ret);
+
+ bio_put(comp_bio);
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+ GFP_NOFS);
+ atomic_inc(&cb->pending_bios);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ cur_disk_byte += PAGE_CACHE_SIZE;
+ }
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+ BUG_ON(ret);
+
+ bio_put(comp_bio);
+ return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47f..793d8fdda244 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
__le32 nsec;
} __attribute__ ((__packed__));
-/*
- * there is no padding here on purpose. If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+ BTRFS_ENCRYPTION_NONE = 0,
+ BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+
struct btrfs_inode_item {
/* nfs style generation number */
__le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
__le64 rdev;
__le16 flags;
__le16 compat_flags;
+
struct btrfs_timespec atime;
struct btrfs_timespec ctime;
struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
#define BTRFS_FILE_EXTENT_INLINE 1
struct btrfs_file_extent_item {
+ /*
+ * transaction id that created this extent
+ */
__le64 generation;
+ /*
+ * max number of bytes to hold this extent in ram
+ * when we split a compressed extent we can't know how big
+ * each of the resulting pieces will be. So, this is
+ * an upper limit on the size of the extent in ram instead of
+ * an exact limit.
+ */
+ __le64 ram_bytes;
+
+ /*
+ * 32 bits for the various ways we might encode the data,
+ * including compression and encryption. If any of these
+ * are set to something a given disk format doesn't understand
+ * it is treated like an incompat flag for reading and writing,
+ * but not for stat.
+ */
+ u8 compression;
+ u8 encryption;
+ __le16 other_encoding; /* spare for later use */
+
+ /* are we inline data or a real extent? */
u8 type;
+
/*
* disk space consumed by the extent, checksum blocks are included
* in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
*/
__le64 offset;
/*
- * the logical number of file blocks (no csums included)
+ * the logical number of file blocks (no csums included). This
+ * always reflects the size uncompressed and without encoding.
*/
__le64 num_bytes;
+
} __attribute__ ((__packed__));
struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
#define BTRFS_MOUNT_NOBARRIER (1 << 2)
#define BTRFS_MOUNT_SSD (1 << 3)
#define BTRFS_MOUNT_DEGRADED (1 << 4)
+#define BTRFS_MOUNT_COMPRESS (1 << 5)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
#define BTRFS_INODE_NODATASUM (1 << 0)
#define BTRFS_INODE_NODATACOW (1 << 1)
#define BTRFS_INODE_READONLY (1 << 2)
+#define BTRFS_INODE_NOCOMPRESS (1 << 3)
#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
~BTRFS_INODE_##flag)
#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
}
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
- struct btrfs_item *e)
-{
- unsigned long offset;
- offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
- return btrfs_item_size(eb, e) - offset;
-}
-
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
disk_bytenr, 64);
BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
offset, 64);
BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+ ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+ compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+ struct btrfs_file_extent_item *e)
+{
+ return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers. If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+ struct btrfs_item *e)
+{
+ unsigned long offset;
+ offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+ return btrfs_item_size(eb, e) - offset;
+}
static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
{
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos, u64 disk_offset,
- u64 disk_num_bytes,
- u64 num_bytes, u64 offset);
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+ u64 start, unsigned long len);
struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
int namelen);
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio);
+ size_t size, struct bio *bio, unsigned long bio_flags);
unsigned long btrfs_force_ra(struct address_space *mapping,
struct file_ra_state *ra, struct file *file,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb6194..dc95f636a11b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
extent_submit_bio_hook_t *submit_bio_hook;
int rw;
int mirror_num;
+ unsigned long bio_flags;
struct btrfs_work work;
};
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
}
em->start = 0;
em->len = (u64)-1;
+ em->block_len = (u64)-1;
em->block_start = 0;
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
wake_up(&fs_info->async_submit_wait);
async->submit_bio_hook(async->inode, async->rw, async->bio,
- async->mirror_num);
+ async->mirror_num, async->bio_flags);
kfree(async);
}
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
extent_submit_bio_hook_t *submit_bio_hook)
{
struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_hook = submit_bio_hook;
async->work.func = run_one_async_submit;
async->work.flags = 0;
+ async->bio_flags = bio_flags;
while(atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
}
static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num)
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num)
+ int mirror_num, unsigned long bio_flags)
{
/*
* kthread helpers are used to submit writes so that checksumming
* can happen in parallel across all CPUs
*/
if (!(rw & (1 << BIO_RW))) {
- return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+ return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
}
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, mirror_num,
+ inode, rw, bio, mirror_num, 0,
__btree_submit_bio_hook);
}
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->btree_inode = new_inode(sb);
fs_info->btree_inode->i_ino = 1;
fs_info->btree_inode->i_nlink = 1;
+
fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
*/
btrfs_init_workers(&fs_info->workers, "worker",
fs_info->thread_pool_size);
+
btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+ fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+ 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
nodesize = btrfs_super_nodesize(disk_super);
leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbbb..4eb1f1408d21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
extent_submit_bio_hook_t *submit_bio_hook);
int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6d..bbf04e80a1a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
em->start = extent_key->objectid - offset;
em->len = extent_key->offset;
+ em->block_len = extent_key->offset;
em->block_start = extent_key->objectid;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
};
struct disk_extent {
+ u64 ram_bytes;
u64 disk_bytenr;
u64 disk_num_bytes;
u64 offset;
u64 num_bytes;
+ u8 compression;
+ u8 encryption;
+ u16 other_encoding;
};
static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
btrfs_file_extent_disk_num_bytes(leaf, fi);
exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+ exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+ exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+ exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+ fi);
WARN_ON(exts[nr].offset > 0);
WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
@@ -3846,6 +3856,8 @@ next:
new_extents[0].disk_bytenr);
btrfs_set_file_extent_disk_num_bytes(leaf, fi,
new_extents[0].disk_num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi,
+ new_extents[0].ram_bytes);
ext_offset += new_extents[0].offset;
btrfs_set_file_extent_offset(leaf, fi, ext_offset);
btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
new_extents[i].disk_bytenr);
btrfs_set_file_extent_disk_num_bytes(leaf, fi,
new_extents[i].disk_num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi,
+ new_extents[i].ram_bytes);
+
+ btrfs_set_file_extent_compression(leaf, fi,
+ new_extents[i].compression);
+ btrfs_set_file_extent_encryption(leaf, fi,
+ new_extents[i].encryption);
+ btrfs_set_file_extent_other_encoding(leaf, fi,
+ new_extents[i].other_encoding);
+
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_len);
ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_ram_bytes(leaf, fi,
+ new_extent->ram_bytes);
btrfs_set_file_extent_disk_bytenr(leaf, fi,
new_extent->disk_bytenr);
btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
BUG_ON(err);
err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
- group->key.offset, 0);
+ group->key.offset, 0, group->key.offset,
+ 0, 0, 0);
BUG_ON(err);
inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f2..314041fdfa43 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
static LIST_HEAD(buffers);
static LIST_HEAD(states);
+#define LEAK_DEBUG 1
#ifdef LEAK_DEBUG
static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
#endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
- u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+ u64 *start, u64 *end, u64 max_bytes)
{
struct rb_node *node;
struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
u64 total_bytes = 0;
spin_lock_irq(&tree->lock);
+
/*
* this search will find all the extents that end after
* our range starts.
*/
-search_again:
node = tree_search(tree, cur_start);
if (!node) {
if (!found)
@@ -1100,40 +1101,6 @@ search_again:
*end = state->end;
goto out;
}
- if (!found && !(state->state & EXTENT_BOUNDARY)) {
- struct extent_state *prev_state;
- struct rb_node *prev_node = node;
- while(1) {
- prev_node = rb_prev(prev_node);
- if (!prev_node)
- break;
- prev_state = rb_entry(prev_node,
- struct extent_state,
- rb_node);
- if ((prev_state->end + 1 != state->start) ||
- !(prev_state->state & EXTENT_DELALLOC))
- break;
- if ((cur_start - prev_state->start) * 2 >
- max_bytes)
- break;
- state = prev_state;
- node = prev_node;
- }
- }
- if (state->state & EXTENT_LOCKED) {
- DEFINE_WAIT(wait);
- atomic_inc(&state->refs);
- prepare_to_wait(&state->wq, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&tree->lock);
- schedule();
- spin_lock_irq(&tree->lock);
- finish_wait(&state->wq, &wait);
- free_extent_state(state);
- goto search_again;
- }
- set_state_cb(tree, state, EXTENT_LOCKED);
- state->state |= EXTENT_LOCKED;
if (!found)
*start = state->start;
found++;
@@ -1151,6 +1118,208 @@ out:
return found;
}
+static noinline int __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+
+ if (index == locked_page->index && end_index == index)
+ return 0;
+
+ while(nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min(nr_pages, ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (pages[i] != locked_page)
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ return 0;
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+ struct page *locked_page,
+ u64 delalloc_start,
+ u64 delalloc_end)
+{
+ unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+ unsigned long start_index = index;
+ unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+ unsigned long pages_locked = 0;
+ struct page *pages[16];
+ unsigned long nrpages;
+ int ret;
+ int i;
+
+ /* the caller is responsible for locking the start index */
+ if (index == locked_page->index && index == end_index)
+ return 0;
+
+ /* skip the page at the start index */
+ nrpages = end_index - index + 1;
+ while(nrpages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min(nrpages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ /* now we have an array of pages, lock them all */
+ for (i = 0; i < ret; i++) {
+ /*
+ * the caller is taking responsibility for
+ * locked_page
+ */
+ if (pages[i] != locked_page)
+ lock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ pages_locked += ret;
+ nrpages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ ret = 0;
+done:
+ if (ret && pages_locked) {
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start,
+ ((u64)(start_index + pages_locked - 1)) <<
+ PAGE_CACHE_SHIFT);
+ }
+ return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page,
+ u64 *start, u64 *end,
+ u64 max_bytes)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 found;
+ int ret;
+ int loops = 0;
+
+again:
+ /* step one, find a bunch of delalloc bytes starting at start */
+ delalloc_start = *start;
+ delalloc_end = 0;
+ found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes);
+ if (!found) {
+ *start = delalloc_start;
+ *end = delalloc_end;
+ return found;
+ }
+
+ /*
+ * make sure to limit the number of pages we try to lock down
+ * if we're looping.
+ */
+ if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+ delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
+ ~((u64)PAGE_CACHE_SIZE - 1);
+ }
+ /* step two, lock all the pages after the page that has start */
+ ret = lock_delalloc_pages(inode, locked_page,
+ delalloc_start, delalloc_end);
+ if (ret == -EAGAIN) {
+ /* some of the pages are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching
+ */
+ if (!loops) {
+ unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+ max_bytes = PAGE_CACHE_SIZE - offset;
+ loops = 1;
+ goto again;
+ } else {
+ found = 0;
+ goto out_failed;
+ }
+ }
+ BUG_ON(ret);
+
+ /* step three, lock the state bits for the whole range */
+ lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+ /* then test to make sure it is all still delalloc */
+ ret = test_range_bit(tree, delalloc_start, delalloc_end,
+ EXTENT_DELALLOC, 1);
+ if (!ret) {
+ unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start, delalloc_end);
+ cond_resched();
+ goto again;
+ }
+ *start = delalloc_start;
+ *end = delalloc_end;
+out_failed:
+ return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode,
+ struct extent_io_tree *tree,
+ u64 start, u64 end, struct page *locked_page,
+ int clear_dirty, int set_writeback,
+ int end_writeback)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+
+ if (clear_dirty)
+ clear_bits |= EXTENT_DIRTY;
+
+ clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+ while(nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min(nr_pages, ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (pages[i] == locked_page) {
+ page_cache_release(pages[i]);
+ continue;
+ }
+ if (clear_dirty)
+ clear_page_dirty_for_io(pages[i]);
+ if (set_writeback)
+ set_page_writeback(pages[i]);
+ if (end_writeback)
+ end_page_writeback(pages[i]);
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
/*
* count the number of bytes in the tree that have a given bit(s)
* set. This can be fairly slow, except for EXTENT_DIRTY which is
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
int ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
- struct rb_node *node;
- struct extent_state *state;
u64 start;
u64 end;
start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
end = start + bvec->bv_len - 1;
- spin_lock_irq(&tree->lock);
- node = __etree_search(tree, start, NULL, NULL);
- BUG_ON(!node);
- state = rb_entry(node, struct extent_state, rb_node);
- while(state->end < end) {
- node = rb_next(node);
- state = rb_entry(node, struct extent_state, rb_node);
- }
- BUG_ON(state->end != end);
- spin_unlock_irq(&tree->lock);
-
bio->bi_private = NULL;
bio_get(bio);
if (tree->ops && tree->ops->submit_bio_hook)
tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
- mirror_num);
+ mirror_num, bio_flags);
else
submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
struct bio **bio_ret,
unsigned long max_pages,
bio_end_io_t end_io_func,
- int mirror_num)
+ int mirror_num,
+ unsigned long prev_bio_flags,
+ unsigned long bio_flags)
{
int ret = 0;
struct bio *bio;
int nr;
+ int contig = 0;
+ int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+ int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+ size_t page_size = min(size, PAGE_CACHE_SIZE);
if (bio_ret && *bio_ret) {
bio = *bio_ret;
- if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+ if (old_compressed)
+ contig = bio->bi_sector == sector;
+ else
+ contig = bio->bi_sector + (bio->bi_size >> 9) ==
+ sector;
+
+ if (prev_bio_flags != bio_flags || !contig ||
(tree->ops && tree->ops->merge_bio_hook &&
- tree->ops->merge_bio_hook(page, offset, size, bio)) ||
- bio_add_page(bio, page, size, offset) < size) {
- ret = submit_one_bio(rw, bio, mirror_num);
+ tree->ops->merge_bio_hook(page, offset, page_size, bio,
+ bio_flags)) ||
+ bio_add_page(bio, page, page_size, offset) < page_size) {
+ ret = submit_one_bio(rw, bio, mirror_num,
+ prev_bio_flags);
bio = NULL;
} else {
return 0;
}
}
- nr = bio_get_nr_vecs(bdev);
+ if (this_compressed)
+ nr = BIO_MAX_PAGES;
+ else
+ nr = bio_get_nr_vecs(bdev);
+
bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
if (!bio) {
printk("failed to allocate bio nr %d\n", nr);
}
-
- bio_add_page(bio, page, size, offset);
+ bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
if (bio_ret) {
*bio_ret = bio;
} else {
- ret = submit_one_bio(rw, bio, mirror_num);
+ ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
}
return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
static int __extent_read_full_page(struct extent_io_tree *tree,
struct page *page,
get_extent_t *get_extent,
- struct bio **bio, int mirror_num)
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags)
{
struct inode *inode = page->mapping->host;
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
int nr = 0;
size_t page_offset = 0;
size_t iosize;
+ size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
+ unsigned long this_bio_flag = 0;
set_page_extent_mapped(page);
end = page_end;
lock_extent(tree, start, end, GFP_NOFS);
+ if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+ char *userpage;
+ size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+ if (zero_offset) {
+ iosize = PAGE_CACHE_SIZE - zero_offset;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + zero_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+ }
+ }
while (cur <= end) {
if (cur >= last_byte) {
char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
}
BUG_ON(end < cur);
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ this_bio_flag = EXTENT_BIO_COMPRESSED;
+
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
- sector = (em->block_start + extent_offset) >> 9;
+ if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+ disk_io_size = em->block_len;
+ sector = em->block_start >> 9;
+ } else {
+ sector = (em->block_start + extent_offset) >> 9;
+ disk_io_size = iosize;
+ }
bdev = em->bdev;
block_start = em->block_start;
free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
pnr -= page->index;
ret = submit_extent_page(READ, tree, page,
- sector, iosize, page_offset,
+ sector, disk_io_size, page_offset,
bdev, bio, pnr,
- end_bio_extent_readpage, mirror_num);
+ end_bio_extent_readpage, mirror_num,
+ *bio_flags,
+ this_bio_flag);
nr++;
+ *bio_flags = this_bio_flag;
}
if (ret)
SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent)
{
struct bio *bio = NULL;
+ unsigned long bio_flags = 0;
int ret;
- ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
+ ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+ &bio_flags);
if (bio)
- submit_one_bio(READ, bio, 0);
+ submit_one_bio(READ, bio, 0, bio_flags);
return ret;
}
EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
u64 nr_delalloc;
u64 delalloc_end;
+ int page_started;
+ int compressed;
WARN_ON(!PageLocked(page));
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_start = start;
delalloc_end = 0;
+ page_started = 0;
while(delalloc_end < page_end) {
- nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+ nr_delalloc = find_lock_delalloc_range(inode, tree,
+ page,
+ &delalloc_start,
&delalloc_end,
128 * 1024 * 1024);
if (nr_delalloc == 0) {
delalloc_start = delalloc_end + 1;
continue;
}
- tree->ops->fill_delalloc(inode, delalloc_start,
- delalloc_end);
- clear_extent_bit(tree, delalloc_start,
- delalloc_end,
- EXTENT_LOCKED | EXTENT_DELALLOC,
- 1, 0, GFP_NOFS);
+ tree->ops->fill_delalloc(inode, page, delalloc_start,
+ delalloc_end, &page_started);
delalloc_start = delalloc_end + 1;
}
+
+ /* did the fill delalloc function already unlock and start the IO? */
+ if (page_started) {
+ return 0;
+ }
+
lock_extent(tree, start, page_end, GFP_NOFS);
unlock_start = start;
if (tree->ops && tree->ops->writepage_start_hook) {
- ret = tree->ops->writepage_start_hook(page, start, page_end);
+ ret = tree->ops->writepage_start_hook(page, start,
+ page_end);
if (ret == -EAGAIN) {
unlock_extent(tree, start, page_end, GFP_NOFS);
redirty_page_for_writepage(wbc, page);
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
sector = (em->block_start + extent_offset) >> 9;
bdev = em->bdev;
block_start = em->block_start;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
free_extent_map(em);
em = NULL;
- if (block_start == EXTENT_MAP_HOLE ||
+ /*
+ * compressed and inline extents are written through other
+ * paths in the FS
+ */
+ if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
clear_extent_dirty(tree, cur,
cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
unlock_extent(tree, unlock_start, cur + iosize -1,
GFP_NOFS);
- if (tree->ops && tree->ops->writepage_end_io_hook)
+ /*
+ * end_io notification does not happen here for
+ * compressed extents
+ */
+ if (!compressed && tree->ops &&
+ tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
cur + iosize - 1,
NULL, 1);
- cur = cur + iosize;
+ else if (compressed) {
+ /* we don't want to end_page_writeback on
+ * a compressed extent. this happens
+ * elsewhere
+ */
+ nr++;
+ }
+
+ cur += iosize;
pg_offset += iosize;
unlock_start = cur;
continue;
}
-
/* leave this out until we have a page_mkwrite call */
if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset += iosize;
continue;
}
+
clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
ret = submit_extent_page(WRITE, tree, page, sector,
iosize, pg_offset, bdev,
&epd->bio, max_nr,
- end_bio_extent_writepage, 0);
+ end_bio_extent_writepage,
+ 0, 0, 0);
if (ret)
SetPageError(page);
}
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
extent_write_cache_pages(tree, mapping, &wbc_writepages,
__extent_writepage, &epd);
if (epd.bio) {
- submit_one_bio(WRITE, epd.bio, 0);
+ submit_one_bio(WRITE, epd.bio, 0, 0);
}
return ret;
}
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
ret = extent_write_cache_pages(tree, mapping, wbc,
__extent_writepage, &epd);
if (epd.bio) {
- submit_one_bio(WRITE, epd.bio, 0);
+ submit_one_bio(WRITE, epd.bio, 0, 0);
}
return ret;
}
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
struct bio *bio = NULL;
unsigned page_idx;
struct pagevec pvec;
+ unsigned long bio_flags = 0;
pagevec_init(&pvec, 0);
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
if (!pagevec_add(&pvec, page))
__pagevec_lru_add(&pvec);
__extent_read_full_page(tree, page, get_extent,
- &bio, 0);
+ &bio, 0, &bio_flags);
}
page_cache_release(page);
}
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
__pagevec_lru_add(&pvec);
BUG_ON(!list_empty(pages));
if (bio)
- submit_one_bio(READ, bio, 0);
+ submit_one_bio(READ, bio, 0, bio_flags);
return 0;
}
EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
ret = submit_extent_page(READ, tree, page,
sector, iosize, page_offset, em->bdev,
NULL, 1,
- end_bio_extent_preparewrite, 0);
+ end_bio_extent_preparewrite, 0,
+ 0, 0);
iocount++;
block_start = block_start + iosize;
} else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
}
if (!test_range_bit(tree, em->start,
extent_map_end(em) - 1,
- EXTENT_LOCKED, 0)) {
+ EXTENT_LOCKED | EXTENT_WRITEBACK |
+ EXTENT_ORDERED,
+ 0)) {
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
int inc_all_pages = 0;
unsigned long num_pages;
struct bio *bio = NULL;
+ unsigned long bio_flags = 0;
if (eb->flags & EXTENT_UPTODATE)
return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
- mirror_num);
+ mirror_num, &bio_flags);
if (err) {
ret = err;
printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
}
if (bio)
- submit_one_bio(READ, bio, mirror_num);
+ submit_one_bio(READ, bio, mirror_num, bio_flags);
if (ret || !wait) {
if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c9d1908a1ae3..86f859b87a6e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
#define EXTENT_BOUNDARY (1 << 11)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
/*
* page->private values. Every page that is controlled by the extent
* map has page->private set to one.
@@ -28,14 +31,17 @@
struct extent_state;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
- struct bio *bio, int mirror_num);
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
struct extent_io_ops {
- int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+ int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started);
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
extent_submit_bio_hook_t *submit_bio_hook;
int (*merge_bio_hook)(struct page *page, unsigned long offset,
- size_t size, struct bio *bio);
+ size_t size, struct bio *bio,
+ unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
int release_extent_buffer_tail_pages(struct extent_buffer *eb);
int extent_range_uptodate(struct extent_io_tree *tree,
u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+ struct extent_io_tree *tree,
+ u64 start, u64 end, struct page *locked_page,
+ int clear_dirty, int set_writeback,
+ int clear_writeback);
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 74b2a29880d3..fd3ebfb8c3c5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
return 0;
+ /*
+ * don't merge compressed extents, we need to know their
+ * actual size
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+ return 0;
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
if (rb && mergable_maps(merge, em)) {
em->start = merge->start;
em->len += merge->len;
+ em->block_len += merge->block_len;
em->block_start = merge->block_start;
merge->in_tree = 0;
rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(em, merge)) {
em->len += merge->len;
+ em->block_len += merge->len;
rb_erase(&merge->rb_node, &tree->map);
merge->in_tree = 0;
free_extent_map(merge);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 26ac6fe0b268..abbcbeb28c79 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
/* bits for the flags field */
#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
struct extent_map {
struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
u64 start;
u64 len;
u64 block_start;
+ u64 block_len;
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
static inline u64 extent_map_block_end(struct extent_map *em)
{
- if (em->block_start + em->len < em->block_start)
+ if (em->block_start + em->block_len < em->block_start)
return (u64)-1;
- return em->block_start + em->len;
+ return em->block_start + em->block_len;
}
void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6dbe88b9d7d4..f4d3fa71bc41 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset)
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding)
{
int ret = 0;
struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
btrfs_set_file_extent_offset(leaf, item, offset);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, compression);
+ btrfs_set_file_extent_encryption(leaf, item, encryption);
+ btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
@@ -213,6 +219,73 @@ found:
return 0;
}
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+ u64 start, unsigned long len)
+{
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_sector_sum *sector_sum;
+ struct btrfs_ordered_extent *ordered;
+ char *data;
+ struct page *page;
+ unsigned long total_bytes = 0;
+ unsigned long this_sum_bytes = 0;
+
+ sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+ if (!sums)
+ return -ENOMEM;
+
+ sector_sum = sums->sums;
+ sums->file_offset = start;
+ sums->len = len;
+ INIT_LIST_HEAD(&sums->list);
+ ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+ BUG_ON(!ordered);
+
+ while(len > 0) {
+ if (start >= ordered->file_offset + ordered->len ||
+ start < ordered->file_offset) {
+ sums->len = this_sum_bytes;
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+
+ sums = kzalloc(btrfs_ordered_sum_size(root, len),
+ GFP_NOFS);
+ BUG_ON(!sums);
+ sector_sum = sums->sums;
+ sums->len = len;
+ sums->file_offset = start;
+ ordered = btrfs_lookup_ordered_extent(inode,
+ sums->file_offset);
+ BUG_ON(!ordered);
+ }
+
+ page = find_get_page(inode->i_mapping,
+ start >> PAGE_CACHE_SHIFT);
+
+ data = kmap_atomic(page, KM_USER0);
+ sector_sum->sum = ~(u32)0;
+ sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
+ PAGE_CACHE_SIZE);
+ kunmap_atomic(data, KM_USER0);
+ btrfs_csum_final(sector_sum->sum,
+ (char *)&sector_sum->sum);
+ sector_sum->offset = page_offset(page);
+ page_cache_release(page);
+
+ sector_sum++;
+ total_bytes += PAGE_CACHE_SIZE;
+ this_sum_bytes += PAGE_CACHE_SIZE;
+ start += PAGE_CACHE_SIZE;
+
+ WARN_ON(len < PAGE_CACHE_SIZE);
+ len -= PAGE_CACHE_SIZE;
+ }
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio)
{
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add2..0aa15436590e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
}
}
-/* this does all the hard work for inserting an inline extent into
- * the btree. Any existing inline extent is extended as required to make room,
- * otherwise things are inserted as required into the btree
- */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- u64 offset, size_t size,
- struct page **pages, size_t page_offset,
- int num_pages)
-{
- struct btrfs_key key;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- char *kaddr;
- unsigned long ptr;
- struct btrfs_file_extent_item *ei;
- struct page *page;
- u32 datasize;
- int err = 0;
- int ret;
- int i;
- ssize_t cur_size;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- btrfs_set_trans_block_group(trans, inode);
-
- key.objectid = inode->i_ino;
- key.offset = offset;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
- if (ret == 1) {
- struct btrfs_key found_key;
-
- if (path->slots[0] == 0)
- goto insert;
-
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- if (found_key.objectid != inode->i_ino)
- goto insert;
-
- if (found_key.type != BTRFS_EXTENT_DATA_KEY)
- goto insert;
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_type(leaf, ei) !=
- BTRFS_FILE_EXTENT_INLINE) {
- goto insert;
- }
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- ret = 0;
- }
- if (ret == 0) {
- u32 found_size;
- u64 found_end;
-
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_type(leaf, ei) !=
- BTRFS_FILE_EXTENT_INLINE) {
- err = ret;
- btrfs_print_leaf(root, leaf);
- printk("found wasn't inline offset %Lu inode %lu\n",
- offset, inode->i_ino);
- goto fail;
- }
- found_size = btrfs_file_extent_inline_len(leaf,
- btrfs_item_nr(leaf, path->slots[0]));
- found_end = key.offset + found_size;
-
- if (found_end < offset + size) {
- btrfs_release_path(root, path);
- ret = btrfs_search_slot(trans, root, &key, path,
- offset + size - found_end, 1);
- BUG_ON(ret != 0);
-
- ret = btrfs_extend_item(trans, root, path,
- offset + size - found_end);
- if (ret) {
- err = ret;
- goto fail;
- }
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- inode_add_bytes(inode, offset + size - found_end);
- }
- if (found_end < offset) {
- ptr = btrfs_file_extent_inline_start(ei) + found_size;
- memset_extent_buffer(leaf, 0, ptr, offset - found_end);
- }
- } else {
-insert:
- btrfs_release_path(root, path);
- datasize = offset + size - key.offset;
- inode_add_bytes(inode, datasize);
- datasize = btrfs_file_extent_calc_inline_size(datasize);
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- datasize);
- if (ret) {
- err = ret;
- printk("got bad ret %d\n", ret);
- goto fail;
- }
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(leaf, ei, trans->transid);
- btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
- }
- ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
-
- cur_size = size;
- i = 0;
- while (size > 0) {
- page = pages[i];
- kaddr = kmap_atomic(page, KM_USER0);
- cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
- write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
- kunmap_atomic(kaddr, KM_USER0);
- page_offset = 0;
- ptr += cur_size;
- size -= cur_size;
- if (i >= num_pages) {
- printk("i %d num_pages %d\n", i, num_pages);
- }
- i++;
- }
- btrfs_mark_buffer_dirty(leaf);
-fail:
- btrfs_free_path(path);
- return err;
-}
-
/*
* after copy_from_user, pages need to be dirtied and we need to make
* sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
u64 start_pos;
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
- u64 inline_size;
- int did_inline = 0;
loff_t isize = i_size_read(inode);
start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
err = btrfs_insert_file_extent(trans, root,
inode->i_ino,
last_pos_in_file,
- 0, 0, hole_size, 0);
+ 0, 0, hole_size, 0,
+ hole_size, 0, 0, 0);
btrfs_drop_extent_cache(inode, last_pos_in_file,
last_pos_in_file + hole_size - 1, 0);
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
goto failed;
}
- /*
- * either allocate an extent for the new bytes or setup the key
- * to show we are doing inline data in the extent
+ /* check for reserved extents on each page, we don't want
+ * to reset the delalloc bit on things that already have
+ * extents reserved.
*/
- inline_size = end_pos;
- if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
- inline_size > root->fs_info->max_inline ||
- (inline_size & (root->sectorsize -1)) == 0 ||
- inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
- /* check for reserved extents on each page, we don't want
- * to reset the delalloc bit on things that already have
- * extents reserved.
- */
- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
- for (i = 0; i < num_pages; i++) {
- struct page *p = pages[i];
- SetPageUptodate(p);
- ClearPageChecked(p);
- set_page_dirty(p);
- }
- } else {
- u64 aligned_end;
- /* step one, delete the existing extents in this range */
- aligned_end = (pos + write_bytes + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
- mutex_lock(&BTRFS_I(inode)->extent_mutex);
- err = btrfs_drop_extents(trans, root, inode, start_pos,
- aligned_end, aligned_end, &hint_byte);
- if (err)
- goto failed;
- if (isize > inline_size)
- inline_size = min_t(u64, isize, aligned_end);
- inline_size -= start_pos;
- err = insert_inline_extent(trans, root, inode, start_pos,
- inline_size, pages, 0, num_pages);
- btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
- BUG_ON(err);
- mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
- /*
- * an ugly way to do all the prop accounting around
- * the page bits and mapping tags
- */
- set_page_writeback(pages[0]);
- end_page_writeback(pages[0]);
- did_inline = 1;
+ btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = pages[i];
+ SetPageUptodate(p);
+ ClearPageChecked(p);
+ set_page_dirty(p);
}
if (end_pos > isize) {
i_size_write(inode, end_pos);
- if (did_inline)
- BTRFS_I(inode)->disk_i_size = end_pos;
btrfs_update_inode(trans, root, inode);
}
failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int ret;
int testend = 1;
unsigned long flags;
+ int compressed = 0;
WARN_ON(end < start);
if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(em);
continue;
}
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
remove_extent_mapping(em_tree, em);
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->start = em->start;
split->len = start - em->start;
split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+
split->bdev = em->bdev;
split->flags = flags;
ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
- split->block_start = em->block_start + diff;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->block_start = em->block_start;
+ } else {
+ split->block_len = split->len;
+ split->block_start = em->block_start + diff;
+ }
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
struct btrfs_item *item;
item = btrfs_item_nr(leaf, slot);
extent_end = found_key.offset +
- btrfs_file_extent_inline_len(leaf, item);
+ btrfs_file_extent_inline_len(leaf, extent);
extent_end = (extent_end + root->sectorsize - 1) &
~((u64)root->sectorsize -1 );
}
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
u64 extent_end = 0;
u64 search_start = start;
u64 leaf_start;
+ u64 ram_bytes = 0;
+ u8 compression = 0;
+ u8 encryption = 0;
+ u16 other_encoding = 0;
u64 root_gen;
u64 root_owner;
struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
int recow;
int ret;
+ inline_limit = 0;
btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, extent);
+ compression = btrfs_file_extent_compression(leaf,
+ extent);
+ encryption = btrfs_file_extent_encryption(leaf,
+ extent);
+ other_encoding = btrfs_file_extent_other_encoding(leaf,
+ extent);
if (found_type == BTRFS_FILE_EXTENT_REG) {
extent_end =
btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
extent_end = key.offset +
btrfs_file_extent_num_bytes(leaf, extent);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+ extent);
found_extent = 1;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- struct btrfs_item *item;
- item = btrfs_item_nr(leaf, slot);
found_inline = 1;
extent_end = key.offset +
- btrfs_file_extent_inline_len(leaf, item);
+ btrfs_file_extent_inline_len(leaf, extent);
}
} else {
extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
search_start = (extent_end + mask) & ~mask;
} else
search_start = extent_end;
- if (end <= extent_end && start >= key.offset && found_inline) {
+
+ if (end <= extent_end && start >= key.offset && found_inline)
*hint_byte = EXTENT_MAP_INLINE;
- goto out;
- }
if (found_extent) {
read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
write_extent_buffer(leaf, &old,
(unsigned long)extent, sizeof(old));
+ btrfs_set_file_extent_compression(leaf, extent,
+ compression);
+ btrfs_set_file_extent_encryption(leaf, extent,
+ encryption);
+ btrfs_set_file_extent_other_encoding(leaf, extent,
+ other_encoding);
btrfs_set_file_extent_offset(leaf, extent,
le64_to_cpu(old.offset) + end - key.offset);
WARN_ON(le64_to_cpu(old.num_bytes) <
(extent_end - end));
btrfs_set_file_extent_num_bytes(leaf, extent,
extent_end - end);
+
+ /*
+ * set the ram bytes to the size of the full extent
+ * before splitting. This is a worst case flag,
+ * but its the best we can do because we don't know
+ * how splitting affects compression
+ */
+ btrfs_set_file_extent_ram_bytes(leaf, extent,
+ ram_bytes);
btrfs_set_file_extent_type(leaf, extent,
BTRFS_FILE_EXTENT_REG);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bf4bed6ca4d6..9797592dc86b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
#include "compat.h"
#include "tree-log.h"
#include "ref-cache.h"
+#include "compression.h"
struct btrfs_iget_args {
u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
};
static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
/*
* a very lame attempt at stopping writes when the FS is 85% full. There
@@ -114,57 +116,374 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
}
/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree. The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ */
+static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ u64 start, size_t size, size_t compressed_size,
+ struct page **compressed_pages)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct page *page = NULL;
+ char *kaddr;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ int err = 0;
+ int ret;
+ size_t cur_size = size;
+ size_t datasize;
+ unsigned long offset;
+ int use_compress = 0;
+
+ if (compressed_size && compressed_pages) {
+ use_compress = 1;
+ cur_size = compressed_size;
+ }
+
+ path = btrfs_alloc_path(); if (!path)
+ return -ENOMEM;
+
+ btrfs_set_trans_block_group(trans, inode);
+
+ key.objectid = inode->i_ino;
+ key.offset = start;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ inode_add_bytes(inode, size);
+ datasize = btrfs_file_extent_calc_inline_size(cur_size);
+
+ inode_add_bytes(inode, size);
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ printk("got bad ret %d\n", ret);
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+ ptr = btrfs_file_extent_inline_start(ei);
+
+ if (use_compress) {
+ struct page *cpage;
+ int i = 0;
+ while(compressed_size > 0) {
+ cpage = compressed_pages[i];
+ cur_size = min(compressed_size,
+ PAGE_CACHE_SIZE);
+
+ kaddr = kmap(cpage);
+ write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap(cpage);
+
+ i++;
+ ptr += cur_size;
+ compressed_size -= cur_size;
+ }
+ btrfs_set_file_extent_compression(leaf, ei,
+ BTRFS_COMPRESS_ZLIB);
+ } else {
+ page = find_get_page(inode->i_mapping,
+ start >> PAGE_CACHE_SHIFT);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ kaddr = kmap_atomic(page, KM_USER0);
+ offset = start & (PAGE_CACHE_SIZE - 1);
+ write_extent_buffer(leaf, kaddr + offset, ptr, size);
+ kunmap_atomic(kaddr, KM_USER0);
+ page_cache_release(page);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ BTRFS_I(inode)->disk_i_size = inode->i_size;
+ btrfs_update_inode(trans, root, inode);
+ return 0;
+fail:
+ btrfs_free_path(path);
+ return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file. This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ */
+static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 start, u64 end,
+ size_t compressed_size,
+ struct page **compressed_pages)
+{
+ u64 isize = i_size_read(inode);
+ u64 actual_end = min(end + 1, isize);
+ u64 inline_len = actual_end - start;
+ u64 aligned_end = (end + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
+ u64 hint_byte;
+ u64 data_len = inline_len;
+ int ret;
+
+ if (compressed_size)
+ data_len = compressed_size;
+
+ if (start > 0 ||
+ data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+ (!compressed_size &&
+ (actual_end & (root->sectorsize - 1)) == 0) ||
+ end + 1 < isize ||
+ data_len > root->fs_info->max_inline) {
+ return 1;
+ }
+
+ mutex_lock(&BTRFS_I(inode)->extent_mutex);
+ ret = btrfs_drop_extents(trans, root, inode, start,
+ aligned_end, aligned_end, &hint_byte);
+ BUG_ON(ret);
+
+ if (isize > actual_end)
+ inline_len = min_t(u64, isize, actual_end);
+ ret = insert_inline_extent(trans, root, inode, start,
+ inline_len, compressed_size,
+ compressed_pages);
+ BUG_ON(ret);
+ btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+ mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+ return 0;
+}
+
+/*
* when extent_io.c finds a delayed allocation range in the file,
* the call backs end up in this code. The basic idea is to
* allocate extents on disk for the range, and create ordered data structs
* in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already. We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it. It may be clean and already done with
+ * IO when we return.
*/
-static int cow_file_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 alloc_hint = 0;
u64 num_bytes;
+ unsigned long ram_size;
+ u64 orig_start;
+ u64 disk_num_bytes;
u64 cur_alloc_size;
u64 blocksize = root->sectorsize;
- u64 orig_num_bytes;
+ u64 actual_end;
struct btrfs_key ins;
struct extent_map *em;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 0;
+ struct page **pages = NULL;
+ unsigned long nr_pages;
+ unsigned long nr_pages_ret = 0;
+ unsigned long total_compressed = 0;
+ unsigned long total_in = 0;
+ unsigned long max_compressed = 128 * 1024;
+ unsigned long max_uncompressed = 256 * 1024;
+ int i;
+ int will_compress;
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
+ orig_start = start;
+
+ /*
+ * compression made this loop a bit ugly, but the basic idea is to
+ * compress some pages but keep the total size of the compressed
+ * extent relatively small. If compression is off, this goto target
+ * is never used.
+ */
+again:
+ will_compress = 0;
+ nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+ nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+ actual_end = min_t(u64, i_size_read(inode), end + 1);
+ total_compressed = actual_end - start;
+
+ /* we want to make sure that amount of ram required to uncompress
+ * an extent is reasonable, so we limit the total size in ram
+ * of a compressed extent to 256k
+ */
+ total_compressed = min(total_compressed, max_uncompressed);
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
num_bytes = max(blocksize, num_bytes);
- orig_num_bytes = num_bytes;
+ disk_num_bytes = num_bytes;
+ total_in = 0;
+ ret = 0;
- if (alloc_hint == EXTENT_MAP_INLINE)
- goto out;
+ /* we do compression for mount -o compress and when the
+ * inode has not been flagged as nocompress
+ */
+ if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+ btrfs_test_opt(root, COMPRESS)) {
+ WARN_ON(pages);
+ pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+
+ /* we want to make sure the amount of IO required to satisfy
+ * a random read is reasonably small, so we limit the size
+ * of a compressed extent to 128k
+ */
+ ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+ total_compressed, pages,
+ nr_pages, &nr_pages_ret,
+ &total_in,
+ &total_compressed,
+ max_compressed);
+
+ if (!ret) {
+ unsigned long offset = total_compressed &
+ (PAGE_CACHE_SIZE - 1);
+ struct page *page = pages[nr_pages_ret - 1];
+ char *kaddr;
+
+ /* zero the tail end of the last page, we might be
+ * sending it down to disk
+ */
+ if (offset) {
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0,
+ PAGE_CACHE_SIZE - offset);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ will_compress = 1;
+ }
+ }
+ if (start == 0) {
+ /* lets try to make an inline extent */
+ if (ret || total_in < (end - start + 1)) {
+ /* we didn't compress the entire range, try
+ * to make an uncompressed inline extent. This
+ * is almost sure to fail, but maybe inline sizes
+ * will get bigger later
+ */
+ ret = cow_file_range_inline(trans, root, inode,
+ start, end, 0, NULL);
+ } else {
+ ret = cow_file_range_inline(trans, root, inode,
+ start, end,
+ total_compressed, pages);
+ }
+ if (ret == 0) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ 1, 1, 1);
+ *page_started = 1;
+ ret = 0;
+ goto free_pages_out;
+ }
+ }
+
+ if (will_compress) {
+ /*
+ * we aren't doing an inline extent round the compressed size
+ * up to a block size boundary so the allocator does sane
+ * things
+ */
+ total_compressed = (total_compressed + blocksize - 1) &
+ ~(blocksize - 1);
+
+ /*
+ * one last check to make sure the compression is really a
+ * win, compare the page count read with the blocks on disk
+ */
+ total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+ ~(PAGE_CACHE_SIZE - 1);
+ if (total_compressed >= total_in) {
+ will_compress = 0;
+ } else {
+ disk_num_bytes = total_compressed;
+ num_bytes = total_in;
+ }
+ }
+ if (!will_compress && pages) {
+ /*
+ * the compression code ran but failed to make things smaller,
+ * free any pages it allocated and our page pointer array
+ */
+ for (i = 0; i < nr_pages_ret; i++) {
+ page_cache_release(pages[i]);
+ }
+ kfree(pages);
+ pages = NULL;
+ total_compressed = 0;
+ nr_pages_ret = 0;
+
+ /* flag the file so we don't compress in the future */
+ btrfs_set_flag(inode, NOCOMPRESS);
+ }
+
+ BUG_ON(disk_num_bytes >
+ btrfs_super_total_bytes(&root->fs_info->super_copy));
- BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
mutex_lock(&BTRFS_I(inode)->extent_mutex);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
- while(num_bytes > 0) {
- cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
+ while(disk_num_bytes > 0) {
+ unsigned long min_bytes;
+
+ /*
+ * the max size of a compressed extent is pretty small,
+ * make the code a little less complex by forcing
+ * the allocator to find a whole compressed extent at once
+ */
+ if (will_compress)
+ min_bytes = disk_num_bytes;
+ else
+ min_bytes = root->sectorsize;
+
+ cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
- root->sectorsize, 0, alloc_hint,
+ min_bytes, 0, alloc_hint,
(u64)-1, &ins, 1);
if (ret) {
WARN_ON(1);
- goto out;
+ goto free_pages_out_fail;
}
em = alloc_extent_map(GFP_NOFS);
em->start = start;
- em->len = ins.offset;
+
+ if (will_compress) {
+ ram_size = num_bytes;
+ em->len = num_bytes;
+ } else {
+ /* ramsize == disk size */
+ ram_size = ins.offset;
+ em->len = ins.offset;
+ }
+
em->block_start = ins.objectid;
+ em->block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+
mutex_lock(&BTRFS_I(inode)->extent_mutex);
set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ if (will_compress)
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
while(1) {
spin_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
break;
}
btrfs_drop_extent_cache(inode, start,
- start + ins.offset - 1, 0);
+ start + ram_size - 1, 0);
}
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
cur_alloc_size = ins.offset;
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ins.offset, 0);
+ ram_size, cur_alloc_size, 0,
+ will_compress);
BUG_ON(ret);
- if (num_bytes < cur_alloc_size) {
- printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+
+ if (disk_num_bytes < cur_alloc_size) {
+ printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
cur_alloc_size);
break;
}
+
+ if (will_compress) {
+ /*
+ * we're doing compression, we and we need to
+ * submit the compressed extents down to the device.
+ *
+ * We lock down all the file pages, clearing their
+ * dirty bits and setting them writeback. Everyone
+ * that wants to modify the page will wait on the
+ * ordered extent above.
+ *
+ * The writeback bits on the file pages are
+ * cleared when the compressed pages are on disk
+ */
+ btrfs_end_transaction(trans, root);
+
+ if (start <= page_offset(locked_page) &&
+ page_offset(locked_page) < start + ram_size) {
+ *page_started = 1;
+ }
+
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start,
+ start + ram_size - 1,
+ NULL, 1, 1, 0);
+
+ ret = btrfs_submit_compressed_write(inode, start,
+ ram_size, ins.objectid,
+ cur_alloc_size, pages,
+ nr_pages_ret);
+
+ BUG_ON(ret);
+ trans = btrfs_join_transaction(root, 1);
+ if (start + ram_size < end) {
+ start += ram_size;
+ alloc_hint = ins.objectid + ins.offset;
+ /* pages will be freed at end_bio time */
+ pages = NULL;
+ goto again;
+ } else {
+ /* we've written everything, time to go */
+ break;
+ }
+ }
+ /* we're not doing compressed IO, don't unlock the first
+ * page (which the caller expects to stay locked), don't
+ * clear any dirty bits and don't set any writeback bits
+ */
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ start, start + ram_size - 1,
+ locked_page, 0, 0, 0);
+ disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
}
+
+ ret = 0;
out:
btrfs_end_transaction(trans, root);
+
return ret;
+
+free_pages_out_fail:
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ start, end, locked_page, 0, 0, 0);
+free_pages_out:
+ for (i = 0; i < nr_pages_ret; i++)
+ page_cache_release(pages[i]);
+ if (pages)
+ kfree(pages);
+
+ goto out;
}
/*
@@ -203,7 +591,8 @@ out:
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
-static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started)
{
u64 extent_start;
u64 extent_end;
@@ -260,6 +649,11 @@ again:
extent_end = extent_start + extent_num_bytes;
err = 0;
+ if (btrfs_file_extent_compression(leaf, item) ||
+ btrfs_file_extent_encryption(leaf,item) ||
+ btrfs_file_extent_other_encoding(leaf, item))
+ goto not_found;
+
if (loops && start != extent_start)
goto not_found;
@@ -284,7 +678,8 @@ again:
bytenr += btrfs_file_extent_offset(leaf, item);
extent_num_bytes = min(end + 1, extent_end) - start;
ret = btrfs_add_ordered_extent(inode, start, bytenr,
- extent_num_bytes, 1);
+ extent_num_bytes,
+ extent_num_bytes, 1, 0);
if (ret) {
err = ret;
goto out;
@@ -300,7 +695,8 @@ again:
not_found:
btrfs_end_transaction(trans, root);
btrfs_free_path(path);
- return cow_file_range(inode, start, end);
+ return cow_file_range(inode, locked_page, start, end,
+ page_started);
}
out:
WARN_ON(err);
@@ -312,16 +708,19 @@ out:
/*
* extent_io.c call back to do delayed allocation processing
*/
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
if (btrfs_test_opt(root, NODATACOW) ||
btrfs_test_flag(inode, NODATACOW))
- ret = run_delalloc_nocow(inode, start, end);
+ ret = run_delalloc_nocow(inode, locked_page, start, end,
+ page_started);
else
- ret = cow_file_range(inode, start, end);
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started);
return ret;
}
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
* we don't create bios that span stripes or chunks
*/
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
- size_t size, struct bio *bio)
+ size_t size, struct bio *bio,
+ unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
* are inserted into the btree
*/
int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num)
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
* or reading the csums from the tree before a read
*/
int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num)
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
if (!(rw & (1 << BIO_RW))) {
btrfs_lookup_bio_sums(root, inode, bio);
+
+ if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ return btrfs_submit_compressed_read(inode, bio,
+ mirror_num, bio_flags);
+ }
+
goto mapit;
}
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num,
- __btrfs_submit_bio_hook);
+ bio_flags, __btrfs_submit_bio_hook);
mapit:
return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}
@@ -539,7 +945,7 @@ out_page:
* good idea. This causes problems because we want to make sure COW
* properly happens and the data=ordered rules are followed.
*
- * In our case any range that doesn't have the EXTENT_ORDERED bit set
+ * In our case any range that doesn't have the ORDERED bit set
* hasn't been properly setup for IO. We kick off an async process
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
ordered_extent->start);
btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
- ordered_extent->len);
+ ordered_extent->disk_len);
btrfs_set_file_extent_offset(leaf, extent_item, 0);
+
+ if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+ btrfs_set_file_extent_compression(leaf, extent_item, 1);
+ else
+ btrfs_set_file_extent_compression(leaf, extent_item, 0);
+ btrfs_set_file_extent_encryption(leaf, extent_item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
+
+ /* ram bytes = extent_num_bytes for now */
btrfs_set_file_extent_num_bytes(leaf, extent_item,
ordered_extent->len);
+ btrfs_set_file_extent_ram_bytes(leaf, extent_item,
+ ordered_extent->len);
btrfs_mark_buffer_dirty(leaf);
btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
ins.objectid = ordered_extent->start;
- ins.offset = ordered_extent->len;
+ ins.offset = ordered_extent->disk_len;
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
root->root_key.objectid,
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
int ret;
int rw;
u64 logical;
+ unsigned long bio_flags = 0;
ret = get_state_private(failure_tree, start, &private);
if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
}
logical = start - em->start;
logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ bio_flags = EXTENT_BIO_COMPRESSED;
failrec->logical = logical;
free_extent_map(em);
set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
rw = READ;
BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
- failrec->last_mirror);
+ failrec->last_mirror,
+ bio_flags);
return 0;
}
@@ -1644,10 +2065,8 @@ search_again:
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- struct btrfs_item *item = btrfs_item_nr(leaf,
- path->slots[0]);
item_end += btrfs_file_extent_inline_len(leaf,
- item);
+ fi);
}
item_end--;
}
@@ -1715,7 +2134,14 @@ search_again:
root_owner = btrfs_header_owner(leaf);
}
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- if (!del_item) {
+ /*
+ * we can't truncate inline items that have had
+ * special encodings
+ */
+ if (!del_item &&
+ btrfs_file_extent_compression(leaf, fi) == 0 &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0) {
u32 size = new_size - found_key.offset;
if (root->ref_cows) {
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
err = btrfs_insert_file_extent(trans, root,
inode->i_ino,
hole_start, 0, 0,
- hole_size, 0);
+ hole_size, 0, hole_size,
+ 0, 0, 0);
btrfs_drop_extent_cache(inode, hole_start,
(u64)-1, 0);
btrfs_check_file(root, inode);
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
start_diff = map_start - em->start;
em->start = map_start;
em->len = map_len;
- if (em->block_start < EXTENT_MAP_LAST_BYTE)
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
em->block_start += start_diff;
+ em->block_len -= start_diff;
+ }
return add_extent_mapping(em_tree, em);
}
+static noinline int uncompress_inline(struct btrfs_path *path,
+ struct inode *inode, struct page *page,
+ size_t pg_offset, u64 extent_offset,
+ struct btrfs_file_extent_item *item)
+{
+ int ret;
+ struct extent_buffer *leaf = path->nodes[0];
+ char *tmp;
+ size_t max_size;
+ unsigned long inline_size;
+ unsigned long ptr;
+
+ WARN_ON(pg_offset != 0);
+ max_size = btrfs_file_extent_ram_bytes(leaf, item);
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ btrfs_item_nr(leaf, path->slots[0]));
+ tmp = kmalloc(inline_size, GFP_NOFS);
+ ptr = btrfs_file_extent_inline_start(item);
+
+ read_extent_buffer(leaf, tmp, ptr, inline_size);
+
+ max_size = min(PAGE_CACHE_SIZE, max_size);
+ ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+ inline_size, max_size);
+ if (ret) {
+ char *kaddr = kmap_atomic(page, KM_USER0);
+ unsigned long copy_size = min_t(u64,
+ PAGE_CACHE_SIZE - pg_offset,
+ max_size - extent_offset);
+ memset(kaddr + pg_offset, 0, copy_size);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ kfree(tmp);
+ return 0;
+}
+
/*
* a bit scary, this does extent mapping from logical file offset to the disk.
* the ugly parts come from merging extents from the disk with the
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
+ int compressed;
again:
spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
em->bdev = root->fs_info->fs_devices->latest_bdev;
em->start = EXTENT_MAP_HOLE;
em->len = (u64)-1;
+ em->block_len = (u64)-1;
if (!path) {
path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
+ compressed = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
em->block_start = EXTENT_MAP_HOLE;
goto insert;
}
- bytenr += btrfs_file_extent_offset(leaf, item);
- em->block_start = bytenr;
em->start = extent_start;
em->len = extent_end - extent_start;
+ if (compressed) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->block_start = bytenr;
+ em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+ item);
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, item);
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ }
goto insert;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
u64 page_start;
@@ -3018,8 +3495,7 @@ again:
size_t extent_offset;
size_t copy_size;
- size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
- path->slots[0]));
+ size = btrfs_file_extent_inline_len(leaf, item);
extent_end = (extent_start + size + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
}
em->block_start = EXTENT_MAP_INLINE;
- if (!page) {
+ if (!page || create) {
em->start = extent_start;
- em->len = size;
+ em->len = (size + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
goto out;
}
@@ -3048,11 +3525,22 @@ again:
em->start = extent_start + extent_offset;
em->len = (copy_size + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
- map = kmap(page);
+ if (compressed)
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
- read_extent_buffer(leaf, map + pg_offset, ptr,
- copy_size);
+ if (btrfs_file_extent_compression(leaf, item) ==
+ BTRFS_COMPRESS_ZLIB) {
+ ret = uncompress_inline(path, inode, page,
+ pg_offset,
+ extent_offset, item);
+ BUG_ON(ret);
+ } else {
+ map = kmap(page);
+ read_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
+ kunmap(page);
+ }
flush_dcache_page(page);
} else if (create && PageUptodate(page)) {
if (!trans) {
@@ -3063,11 +3551,12 @@ again:
trans = btrfs_join_transaction(root, 1);
goto again;
}
+ map = kmap(page);
write_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
+ kunmap(page);
btrfs_mark_buffer_dirty(leaf);
}
- kunmap(page);
set_extent_uptodate(io_tree, em->start,
extent_map_end(em) - 1, GFP_NOFS);
goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
btrfs_set_file_extent_type(leaf, ei,
BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
ptr = btrfs_file_extent_inline_start(ei);
write_extent_buffer(leaf, symname, ptr, name_len);
btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2eb6caba57c2..b5745bb96d40 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
* inserted.
*/
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, int nocow)
+ u64 start, u64 len, u64 disk_len, int nocow,
+ int compressed)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
@@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
+ entry->disk_len = disk_len;
entry->inode = inode;
if (nocow)
set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
+ if (compressed)
+ set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
/* one ref for the tree */
atomic_set(&entry->refs, 1);
@@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
* for pdflush to find them
*/
btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
- if (wait)
+ if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
+ }
}
/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f50f8870a144..1ef464145d22 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
/* disk byte number */
u64 start;
- /* length of the extent in bytes */
+ /* ram length of the extent in bytes */
u64 len;
+ /* extent length on disk */
+ u64 disk_len;
+
/* flags (described above) */
unsigned long flags;
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
int btrfs_dec_test_ordered_pending(struct inode *inode,
u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, int nocow);
+ u64 start, u64 len, u64 disk_len, int nocow,
+ int compressed);
int btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index bd9ab3e9a7f2..64725c13aa11 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
if (btrfs_file_extent_type(l, fi) ==
BTRFS_FILE_EXTENT_INLINE) {
printk("\t\tinline extent data size %u\n",
- btrfs_file_extent_inline_len(l, item));
+ btrfs_file_extent_inline_len(l, fi));
break;
}
printk("\t\textent data disk bytenr %llu nr %llu\n",
(unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
(unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
- printk("\t\textent data offset %llu nr %llu\n",
+ printk("\t\textent data offset %llu nr %llu ram %llu\n",
(unsigned long long)btrfs_file_extent_offset(l, fi),
- (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
+ (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
+ (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2e6039825b7b..431fdf144b58 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -47,6 +47,7 @@
#include "volumes.h"
#include "version.h"
#include "export.h"
+#include "compression.h"
#define BTRFS_SUPER_MAGIC 0x9123683E
@@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)
enum {
Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
- Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_err,
+ Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
};
static match_table_t tokens = {
@@ -83,6 +84,7 @@ static match_table_t tokens = {
{Opt_max_inline, "max_inline=%s"},
{Opt_alloc_start, "alloc_start=%s"},
{Opt_thread_pool, "thread_pool=%d"},
+ {Opt_compress, "compress"},
{Opt_ssd, "ssd"},
{Opt_noacl, "noacl"},
{Opt_err, NULL},
@@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
+ case Opt_compress:
+ printk(KERN_INFO "btrfs: use compression\n");
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ break;
case Opt_ssd:
printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
btrfs_set_opt(info->mount_opt, SSD);
@@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)
err = btrfs_interface_init();
if (err)
goto free_extent_map;
+
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
@@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
+ btrfs_zlib_exit();
}
module_init(init_btrfs_fs)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cf618cc8b34a..e6d579053a47 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (found_type == BTRFS_FILE_EXTENT_REG)
extent_end = start + btrfs_file_extent_num_bytes(eb, item);
else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_inline_len(eb,
- btrfs_item_nr(eb, slot));
+ size = btrfs_file_extent_inline_len(eb, item);
extent_end = (start + size + mask) & ~mask;
} else {
ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2eed7f91f51a..7db4cfd03a98 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
em->start = key.offset;
em->len = *num_bytes;
em->block_start = 0;
+ em->block_len = em->len;
if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
em->start = logical;
em->len = length;
em->block_start = 0;
+ em->block_len = em->len;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..e99309180a11
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+/* Plan: call deflate() with avail_in == *sourcelen,
+ avail_out = *dstlen - 12 and flush == Z_FINISH.
+ If it doesn't manage to finish, call it again with
+ avail_in == 0 and avail_out set to the remaining 12
+ bytes for it to clean up.
+ Q: Is 12 bytes sufficient?
+*/
+#define STREAM_END_SPACE 12
+
+struct workspace {
+ z_stream inf_strm;
+ z_stream def_strm;
+ char *buf;
+ struct list_head list;
+};
+
+static LIST_HEAD(idle_workspace);
+static DEFINE_SPINLOCK(workspace_lock);
+static unsigned long num_workspace;
+static atomic_t alloc_workspace = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+
+/*
+ * this finds an available zlib workspace or allocates a new one
+ * NULL or an ERR_PTR is returned if things go bad.
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+ struct workspace *workspace;
+ int ret;
+ int cpus = num_online_cpus();
+
+again:
+ spin_lock(&workspace_lock);
+ if (!list_empty(&idle_workspace)) {
+ workspace = list_entry(idle_workspace.next, struct workspace,
+ list);
+ list_del(&workspace->list);
+ num_workspace--;
+ spin_unlock(&workspace_lock);
+ return workspace;
+
+ }
+ spin_unlock(&workspace_lock);
+ if (atomic_read(&alloc_workspace) > cpus) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&alloc_workspace) > cpus)
+ schedule();
+ finish_wait(&workspace_wait, &wait);
+ goto again;
+ }
+ atomic_inc(&alloc_workspace);
+ workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ if (!workspace) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+ if (!workspace->def_strm.workspace) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!workspace->inf_strm.workspace) {
+ ret = -ENOMEM;
+ goto fail_inflate;
+ }
+ workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!workspace->buf) {
+ ret = -ENOMEM;
+ goto fail_kmalloc;
+ }
+ return workspace;
+
+fail_kmalloc:
+ vfree(workspace->inf_strm.workspace);
+fail_inflate:
+ vfree(workspace->def_strm.workspace);
+fail:
+ kfree(workspace);
+ atomic_dec(&alloc_workspace);
+ wake_up(&workspace_wait);
+ return ERR_PTR(ret);
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static int free_workspace(struct workspace *workspace)
+{
+ spin_lock(&workspace_lock);
+ if (num_workspace < num_online_cpus()) {
+ list_add_tail(&workspace->list, &idle_workspace);
+ num_workspace++;
+ spin_unlock(&workspace_lock);
+ if (waitqueue_active(&workspace_wait))
+ wake_up(&workspace_wait);
+ return 0;
+ }
+ spin_unlock(&workspace_lock);
+ vfree(workspace->def_strm.workspace);
+ vfree(workspace->inf_strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+
+ atomic_dec(&alloc_workspace);
+ if (waitqueue_active(&workspace_wait))
+ wake_up(&workspace_wait);
+ return 0;
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+ struct workspace *workspace;
+ while(!list_empty(&idle_workspace)) {
+ workspace = list_entry(idle_workspace.next, struct workspace,
+ list);
+ list_del(&workspace->list);
+ vfree(workspace->def_strm.workspace);
+ vfree(workspace->inf_strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+ atomic_dec(&alloc_workspace);
+ }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller then len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
+{
+ int ret;
+ struct workspace *workspace;
+ char *data_in;
+ char *cpage_out;
+ int nr_pages = 0;
+ struct page *in_page = NULL;
+ struct page *out_page = NULL;
+ int out_written = 0;
+ int in_read = 0;
+ unsigned long bytes_left;
+
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ workspace = find_zlib_workspace();
+ if (!workspace)
+ return -1;
+
+ if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+ printk(KERN_WARNING "deflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+
+ workspace->def_strm.total_in = 0;
+ workspace->def_strm.total_out = 0;
+
+ in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ cpage_out = kmap(out_page);
+ pages[0] = out_page;
+ nr_pages = 1;
+
+ workspace->def_strm.next_in = data_in;
+ workspace->def_strm.next_out = cpage_out;
+ workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+ out_written = 0;
+ in_read = 0;
+
+ while (workspace->def_strm.total_in < len) {
+ ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+ if (ret != Z_OK) {
+ printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+ ret);
+ zlib_deflateEnd(&workspace->def_strm);
+ ret = -1;
+ goto out;
+ }
+
+ /* we're making it bigger, give up */
+ if (workspace->def_strm.total_in > 8192 &&
+ workspace->def_strm.total_in <
+ workspace->def_strm.total_out) {
+ ret = -1;
+ goto out;
+ }
+ /* we need another page for writing out. Test this
+ * before the total_in so we will pull in a new page for
+ * the stream end if required
+ */
+ if (workspace->def_strm.avail_out == 0) {
+ kunmap(out_page);
+ if (nr_pages == nr_dest_pages) {
+ out_page = NULL;
+ ret = -1;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ cpage_out = kmap(out_page);
+ pages[nr_pages] = out_page;
+ nr_pages++;
+ workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->def_strm.next_out = cpage_out;
+ }
+ /* we're all done */
+ if (workspace->def_strm.total_in >= len)
+ break;
+
+ /* we've read in a full page, get a new one */
+ if (workspace->def_strm.avail_in == 0) {
+ if (workspace->def_strm.total_out > max_out)
+ break;
+
+ bytes_left = len - workspace->def_strm.total_in;
+ kunmap(in_page);
+ page_cache_release(in_page);
+
+ start += PAGE_CACHE_SIZE;
+ in_page = find_get_page(mapping,
+ start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+ workspace->def_strm.avail_in = min(bytes_left,
+ PAGE_CACHE_SIZE);
+ workspace->def_strm.next_in = data_in;
+ }
+ }
+ workspace->def_strm.avail_in = 0;
+ ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+ zlib_deflateEnd(&workspace->def_strm);
+
+ if (ret != Z_STREAM_END) {
+ ret = -1;
+ goto out;
+ }
+
+ if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+ *total_out = workspace->def_strm.total_out;
+ *total_in = workspace->def_strm.total_in;
+out:
+ *out_pages = nr_pages;
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
+ page_cache_release(in_page);
+ }
+ free_workspace(workspace);
+ return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous. They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen)
+{
+ int ret = 0;
+ int wbits = MAX_WBITS;
+ struct workspace *workspace;
+ char *data_in;
+ size_t total_out = 0;
+ unsigned long page_bytes_left;
+ unsigned long page_in_index = 0;
+ unsigned long page_out_index = 0;
+ struct page *page_out;
+ unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE;
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+ unsigned long working_bytes;
+ unsigned long pg_offset;
+ unsigned long start_byte;
+ unsigned long current_buf_start;
+ char *kaddr;
+
+ workspace = find_zlib_workspace();
+ if (!workspace)
+ return -ENOMEM;
+
+ data_in = kmap(pages_in[page_in_index]);
+ workspace->inf_strm.next_in = data_in;
+ workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
+ workspace->inf_strm.total_in = 0;
+
+ workspace->inf_strm.total_out = 0;
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ page_out = bvec[page_out_index].bv_page;
+ page_bytes_left = PAGE_CACHE_SIZE;
+ pg_offset = 0;
+
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->inf_strm.next_in += 2;
+ workspace->inf_strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ printk(KERN_WARNING "inflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+ while(workspace->inf_strm.total_in < srclen) {
+ ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END) {
+ break;
+ }
+
+ /*
+ * buf start is the byte offset we're of the start of
+ * our workspace buffer
+ */
+ buf_start = total_out;
+
+ /* total_out is the last byte of the workspace buffer */
+ total_out = workspace->inf_strm.total_out;
+
+ working_bytes = total_out - buf_start;
+
+ /*
+ * start byte is the first byte of the page we're currently
+ * copying into relative to the start of the compressed data.
+ */
+ start_byte = page_offset(page_out) - disk_start;
+
+ if (working_bytes == 0) {
+ /* we didn't make progress in this inflate
+ * call, we're done
+ */
+ if (ret != Z_STREAM_END)
+ ret = -1;
+ break;
+ }
+
+ /* we haven't yet hit data corresponding to this page */
+ if (total_out <= start_byte) {
+ goto next;
+ }
+
+ /*
+ * the start of the data we care about is offset into
+ * the middle of our working buffer
+ */
+ if (total_out > start_byte && buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes -= buf_offset;
+ } else {
+ buf_offset = 0;
+ }
+ current_buf_start = buf_start;
+
+ /* copy bytes from the working buffer into the pages */
+ while(working_bytes > 0) {
+ bytes = min(PAGE_CACHE_SIZE - pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, working_bytes);
+ kaddr = kmap_atomic(page_out, KM_USER0);
+ memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+ bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page_out);
+
+ pg_offset += bytes;
+ page_bytes_left -= bytes;
+ buf_offset += bytes;
+ working_bytes -= bytes;
+ current_buf_start += bytes;
+
+ /* check if we need to pick another page */
+ if (page_bytes_left == 0) {
+ page_out_index++;
+ if (page_out_index >= vcnt) {
+ ret = 0;
+ goto done;
+ }
+ page_out = bvec[page_out_index].bv_page;
+ pg_offset = 0;
+ page_bytes_left = PAGE_CACHE_SIZE;
+ start_byte = page_offset(page_out) - disk_start;
+
+ /*
+ * make sure our new page is covered by this
+ * working buffer
+ */
+ if (total_out <= start_byte) {
+ goto next;
+ }
+
+ /* the next page in the biovec might not
+ * be adjacent to the last page, but it
+ * might still be found inside this working
+ * buffer. bump our offset pointer
+ */
+ if (total_out > start_byte &&
+ current_buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes = total_out - start_byte;
+ current_buf_start = buf_start +
+ buf_offset;
+ }
+ }
+ }
+next:
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+ if (workspace->inf_strm.avail_in == 0) {
+ unsigned long tmp;
+ kunmap(pages_in[page_in_index]);
+ page_in_index++;
+ if (page_in_index >= total_pages_in) {
+ data_in = NULL;
+ break;
+ }
+ data_in = kmap(pages_in[page_in_index]);
+ workspace->inf_strm.next_in = data_in;
+ tmp = srclen - workspace->inf_strm.total_in;
+ workspace->inf_strm.avail_in = min(tmp,
+ PAGE_CACHE_SIZE);
+ }
+ }
+ if (ret != Z_STREAM_END) {
+ ret = -1;
+ } else {
+ ret = 0;
+ }
+done:
+ zlib_inflateEnd(&workspace->inf_strm);
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
+out:
+ free_workspace(workspace);
+ return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen)
+{
+ int ret = 0;
+ int wbits = MAX_WBITS;
+ struct workspace *workspace;
+ unsigned long bytes_left = destlen;
+ unsigned long total_out = 0;
+ char *kaddr;
+
+ if (destlen > PAGE_CACHE_SIZE)
+ return -ENOMEM;
+
+ workspace = find_zlib_workspace();
+ if (!workspace)
+ return -ENOMEM;
+
+ workspace->inf_strm.next_in = data_in;
+ workspace->inf_strm.avail_in = srclen;
+ workspace->inf_strm.total_in = 0;
+
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->inf_strm.total_out = 0;
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->inf_strm.next_in += 2;
+ workspace->inf_strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ printk(KERN_WARNING "inflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+
+ while(bytes_left > 0) {
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+ unsigned long pg_offset = 0;
+
+ ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END) {
+ break;
+ }
+
+ buf_start = total_out;
+ total_out = workspace->inf_strm.total_out;
+
+ if (total_out == buf_start) {
+ ret = -1;
+ break;
+ }
+
+ if (total_out <= start_byte) {
+ goto next;
+ }
+
+ if (total_out > start_byte && buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ } else {
+ buf_offset = 0;
+ }
+
+ bytes = min(PAGE_CACHE_SIZE - pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, bytes_left);
+
+ kaddr = kmap_atomic(dest_page, KM_USER0);
+ memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ pg_offset += bytes;
+ bytes_left -= bytes;
+next:
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ }
+ if (ret != Z_STREAM_END && bytes_left != 0) {
+ ret = -1;
+ } else {
+ ret = 0;
+ }
+ zlib_inflateEnd(&workspace->inf_strm);
+out:
+ free_workspace(workspace);
+ return ret;
+}
+
+void btrfs_zlib_exit(void)
+{
+ free_workspaces();
+}