diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-19 14:36:00 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-19 14:36:00 -0800 |
commit | 2b9fb532d4168e8974fe49709e2c4c8d5352a64c (patch) | |
tree | 610cbe2d1bb32e28db135a767f158ade31452e2e /fs/btrfs | |
parent | 4533f6e27a366ecc3da4876074ebfe0cc0ea4f0f (diff) | |
parent | a742994aa2e271eb8cd8e043d276515ec858ed73 (diff) | |
download | linux-stable-2b9fb532d4168e8974fe49709e2c4c8d5352a64c.tar.gz linux-stable-2b9fb532d4168e8974fe49709e2c4c8d5352a64c.tar.bz2 linux-stable-2b9fb532d4168e8974fe49709e2c4c8d5352a64c.zip |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
"This pull is mostly cleanups and fixes:
- The raid5/6 cleanups from Zhao Lei fixup some long standing warts
in the code and add improvements on top of the scrubbing support
from 3.19.
- Josef has round one of our ENOSPC fixes coming from large btrfs
clusters here at FB.
- Dave Sterba continues a long series of cleanups (thanks Dave), and
Filipe continues hammering on corner cases in fsync and others
This all was held up a little trying to track down a use-after-free in
btrfs raid5/6. It's not clear yet if this is just made easier to
trigger with this pull or if its a new bug from the raid5/6 cleanups.
Dave Sterba is the only one to trigger it so far, but he has a
consistent way to reproduce, so we'll get it nailed shortly"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (68 commits)
Btrfs: don't remove extents and xattrs when logging new names
Btrfs: fix fsync data loss after adding hard link to inode
Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group
Btrfs: account for large extents with enospc
Btrfs: don't set and clear delalloc for O_DIRECT writes
Btrfs: only adjust outstanding_extents when we do a short write
btrfs: Fix out-of-space bug
Btrfs: scrub, fix sleep in atomic context
Btrfs: fix scheduler warning when syncing log
Btrfs: Remove unnecessary placeholder in btrfs_err_code
btrfs: cleanup init for list in free-space-cache
btrfs: delete chunk allocation attemp when setting block group ro
btrfs: clear bio reference after submit_one_bio()
Btrfs: fix scrub race leading to use-after-free
Btrfs: add missing cleanup on sysfs init failure
Btrfs: fix race between transaction commit and empty block group removal
btrfs: add more checks to btrfs_read_sys_array
btrfs: cleanup, rename a few variables in btrfs_read_sys_array
btrfs: add checks for sys_chunk_array sizes
btrfs: more superblock checks, lower bounds on devices and sectorsize/nodesize
...
Diffstat (limited to 'fs/btrfs')
33 files changed, 1062 insertions, 859 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8729cf68d2fe..f55721ff9385 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, return ret; } -/* - * this makes the path point to (inum INODE_ITEM ioff) - */ -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path) -{ - struct btrfs_key key; - return btrfs_find_item(fs_root, path, inum, ioff, - BTRFS_INODE_ITEM_KEY, &key); -} - -static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path, - struct btrfs_key *found_key) -{ - return btrfs_find_item(fs_root, path, inum, ioff, - BTRFS_INODE_REF_KEY, found_key); -} - int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, @@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } - ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + ret = btrfs_find_item(fs_root, path, parent, 0, + BTRFS_INODE_REF_KEY, &found_key); if (ret > 0) ret = -ENOENT; if (ret) @@ -1727,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, - &found_key); + ret = btrfs_find_item(fs_root, path, inum, + parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY, + &found_key); + if (ret < 0) break; if (ret) { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2a1ac6bfc724..9c41fbac3009 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -32,9 +32,6 @@ struct inode_fs_paths { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path); - int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4aadadcfab20..de5e4f2adfea 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -185,6 +185,9 @@ struct btrfs_inode { struct btrfs_delayed_node *delayed_node; + /* File creation time. */ + struct timespec i_otime; + struct inode vfs_inode; }; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 14a72ed14ef7..993642199326 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) */ static void add_root_to_dirty_list(struct btrfs_root *root) { + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || + !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) + return; + spin_lock(&root->fs_info->trans_lock); - if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && - list_empty(&root->dirty_list)) { - list_add(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); + if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { + /* Want the extent tree to be the last on the list */ + if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + list_move_tail(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + else + list_move(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); } spin_unlock(&root->fs_info->trans_lock); } @@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(eb->start, - fs_info->tree_root->nodesize); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(logical, root->nodesize); + eb = alloc_dummy_extent_buffer(root->fs_info, logical); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); @@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize); + readahead_tree_block(root, search); nread += blocksize; } nscan++; @@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root, u64 gen; u64 block1 = 0; u64 block2 = 0; - int blocksize; parent = path->nodes[level + 1]; if (!parent) @@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = root->nodesize; if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); @@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root, } if (block1) - readahead_tree_block(root, block1, blocksize); + readahead_tree_block(root, block1); if (block2) - readahead_tree_block(root, block2, blocksize); + readahead_tree_block(root, block2); } @@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } -int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 iobjectid, u64 ioff, u8 key_type, struct btrfs_key *found_key) { int ret; struct btrfs_key key; struct extent_buffer *eb; - struct btrfs_path *path; + + ASSERT(path); + ASSERT(found_key); key.type = key_type; key.objectid = iobjectid; key.offset = ioff; - if (found_path == NULL) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } else - path = found_path; - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if ((ret < 0) || (found_key == NULL)) { - if (path != found_path) - btrfs_free_path(path); + if (ret < 0) return ret; - } eb = path->nodes[0]; if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { @@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; path->slots[level] = 0; return 0; } @@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, path->search_for_split = 1; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); path->search_for_split = 0; + if (ret > 0) + ret = -EAGAIN; if (ret < 0) goto err; ret = -EAGAIN; leaf = path->nodes[0]; - /* if our item isn't there or got smaller, return now */ - if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + /* if our item isn't there, return now */ + if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. return now */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b180708bf79..84c3b00f3de8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) +#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. @@ -1020,6 +1022,9 @@ enum btrfs_raid_types { BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID10) +#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6) + /* * We need a bit for restriper to be able to tell when chunks of type * SINGLE are available. This "extended" profile format is used in @@ -1239,7 +1244,6 @@ enum btrfs_disk_cache_state { BTRFS_DC_ERROR = 1, BTRFS_DC_CLEAR = 2, BTRFS_DC_SETUP = 3, - BTRFS_DC_NEED_WRITE = 4, }; struct btrfs_caching_control { @@ -1277,7 +1281,6 @@ struct btrfs_block_group_cache { unsigned long full_stripe_len; unsigned int ro:1; - unsigned int dirty:1; unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; @@ -1315,6 +1318,9 @@ struct btrfs_block_group_cache { struct list_head ro_list; atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; }; /* delayed seq elem */ @@ -1741,6 +1747,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; @@ -1776,6 +1783,7 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_DEFRAG_RUNNING 6 #define BTRFS_ROOT_FORCE_COW 7 #define BTRFS_ROOT_MULTI_LOG_TASKS 8 +#define BTRFS_ROOT_DIRTY 9 /* * in ram representation of the tree. extent_root is used for all allocations @@ -1794,8 +1802,6 @@ struct btrfs_root { struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; - struct kobject root_kobj; - struct completion kobj_unregister; struct mutex objectid_mutex; spinlock_t accounting_lock; @@ -2465,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); - -static inline struct btrfs_timespec * -btrfs_inode_atime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, atime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_mtime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, mtime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_ctime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, ctime); - return (struct btrfs_timespec *)ptr; -} - BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index de4e70fb3cbb..82f0c7c95474 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); btrfs_set_stack_inode_block_group(inode_item, 0); - btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->atime, inode->i_atime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->atime, inode->i_atime.tv_nsec); - btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->mtime, inode->i_mtime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->mtime, inode->i_mtime.tv_nsec); - btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->ctime, inode->i_ctime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->ctime, inode->i_ctime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); } int btrfs_fill_inode(struct inode *inode, u32 *rdev) { struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) @@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_stack_timespec_nsec(&inode_item->otime); inode->i_generation = BTRFS_I(inode)->generation; BTRFS_I(inode)->index_cnt = (u64)-1; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ca6a3a3b6b6c..5ec03d999c37 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -440,18 +440,9 @@ leave: */ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { - s64 writers; - DEFINE_WAIT(wait); - set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - do { - prepare_to_wait(&fs_info->replace_wait, &wait, - TASK_UNINTERRUPTIBLE); - writers = percpu_counter_sum(&fs_info->bio_counter); - if (writers) - schedule(); - finish_wait(&fs_info->replace_wait, &wait); - } while (writers); + wait_event(fs_info->replace_wait, !percpu_counter_sum( + &fs_info->bio_counter)); } /* @@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) { - DEFINE_WAIT(wait); -again: - percpu_counter_inc(&fs_info->bio_counter); - if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + while (1) { + percpu_counter_inc(&fs_info->bio_counter); + if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state))) + break; + btrfs_bio_counter_dec(fs_info); wait_event(fs_info->replace_wait, !test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)); - goto again; } - } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1afb18226da8..f79f38542a73 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_INFO + printk_ratelimited(KERN_WARNING "BTRFS: %s checksum verify failed on %llu wanted %X found %X " "level %d\n", root->fs_info->sb->s_id, buf->start, @@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", + printk_ratelimited(KERN_ERR + "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", eb->fs_info->sb->s_id, eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " "%llu %llu\n", eb->fs_info->sb->s_id, found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", eb->fs_info->sb->s_id, eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_info(root->fs_info, "bad tree block level %d", + btrfs_err(root->fs_info, "bad tree block level %d", (int)btrfs_header_level(eb)); ret = -EIO; goto err; @@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) +void readahead_tree_block(struct btrfs_root *root, u64 bytenr) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, @@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) free_extent_buffer(buf); } -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb) { struct extent_buffer *buf = NULL; @@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return 0; @@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr) { if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr, - blocksize); - return alloc_extent_buffer(root->fs_info, bytenr, blocksize); + return alloc_test_extent_buffer(root->fs_info, bytenr); + return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return NULL; @@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - memset(&root->root_kobj, 0, sizeof(root->root_kobj)); if (fs_info) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; - init_completion(&root->kobj_unregister); root->root_key.objectid = objectid; root->anon_dev = 0; @@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, bool check_ref) { struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) @@ -1669,8 +1669,17 @@ again: if (ret) goto fail; - ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, - location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto fail; + } + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + btrfs_free_path(path); if (ret < 0) goto fail; if (ret == 0) @@ -2232,6 +2241,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); @@ -2496,7 +2506,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_ERR "BTRFS: has skinny extents\n"); + printk(KERN_INFO "BTRFS: has skinny extents\n"); /* * flag our filesystem as having big metadata blocks if @@ -2520,7 +2530,7 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " + printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); goto fail_alloc; @@ -2628,12 +2638,12 @@ int open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(sectorsize); if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); + printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " + printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2642,7 +2652,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "BTRFS: failed to read the system " + printk(KERN_ERR "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2657,7 +2667,7 @@ int open_ctree(struct super_block *sb, generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", + printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2669,7 +2679,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", + printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2681,7 +2691,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", + printk(KERN_ERR "BTRFS: failed to read devices on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2765,7 +2775,7 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to recover balance\n"); + printk(KERN_ERR "BTRFS: failed to recover balance\n"); goto fail_block_groups; } @@ -3860,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", btrfs_super_log_root(sb)); + /* + * Check the lower bound, the alignment and other constraints are + * checked later. + */ + if (btrfs_super_nodesize(sb) < 4096) { + printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", + btrfs_super_nodesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_sectorsize(sb) < 4096) { + printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", + btrfs_super_sectorsize(sb)); + ret = -EINVAL; + } + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", fs_info->fsid, sb->dev_item.fsid); @@ -3873,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + printk(KERN_ERR "BTRFS: number of devices is 0\n"); + ret = -EINVAL; + } if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", @@ -3881,6 +3910,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. */ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 414651821fb3..27d44c0fd236 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,11 +46,11 @@ struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u64 parent_transid); -void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +void readahead_tree_block(struct btrfs_root *root, u64 bytenr); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr); void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); int open_ctree(struct super_block *sb, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a684086c3c81..571f402d3fc4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -74,8 +74,9 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(bbio); + btrfs_put_bbio(bbio); } if (actual_bytes) @@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; - int run_most = 0; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, root = root->fs_info->tree_root; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) { + if (count == 0) count = atomic_read(&delayed_refs->num_entries) * 2; - run_most = 1; - } again: #ifdef SCRAMBLE_DELAYED_REFS @@ -3315,120 +3313,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; - int err = 0; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; struct btrfs_path *path; - u64 last = 0; + + if (list_empty(&cur_trans->dirty_bgs)) + return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - err = cache_save_setup(cache, trans, path); - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); - } - - while (1) { - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) { - btrfs_put_block_group(cache); - goto again; - } - - if (cache->dirty) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - if (cache->disk_cache_state == BTRFS_DC_SETUP) - cache->disk_cache_state = BTRFS_DC_NEED_WRITE; - cache->dirty = 0; - last = cache->key.objectid + cache->key.offset; - - err = write_one_cache_group(trans, root, path, cache); - btrfs_put_block_group(cache); - if (err) /* File system offline */ - goto out; - } - - while (1) { - /* - * I don't think this is needed since we're just marking our - * preallocated extent as written, but just in case it can't - * hurt. - */ - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - /* - * Really this shouldn't happen, but it could if we - * couldn't write the entire preallocated extent and - * splitting the extent resulted in a new block. - */ - if (cache->dirty) { - btrfs_put_block_group(cache); - goto again; - } - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - err = btrfs_write_out_cache(root, trans, cache, path); - - /* - * If we didn't have an error then the cache state is still - * NEED_WRITE, so we can set it to WRITTEN. - */ - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - cache->disk_cache_state = BTRFS_DC_WRITTEN; - last = cache->key.objectid + cache->key.offset; + /* + * We don't need the lock here since we are protected by the transaction + * commit. We want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + list_del_init(&cache->dirty_list); + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, + (unsigned long) -1); + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) + btrfs_write_out_cache(root, trans, cache, path); + if (!ret) + ret = write_one_cache_group(trans, root, path, cache); btrfs_put_block_group(cache); } -out: btrfs_free_path(path); - return err; + return ret; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -5043,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for + * @num_bytes: the number of bytes we're relaseing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of * reserved extents that need to be freed. This must be called with * BTRFS_I(inode)->lock held. */ -static unsigned drop_outstanding_extent(struct inode *inode) +static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) { unsigned drop_inode_space = 0; unsigned dropped_extents = 0; + unsigned num_extents = 0; - BUG_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; + num_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + ASSERT(num_extents); + ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); + BTRFS_I(inode)->outstanding_extents -= num_extents; if (BTRFS_I(inode)->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, @@ -5226,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) out_fail: spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers @@ -5305,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); @@ -5375,8 +5301,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc) +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; @@ -5414,6 +5341,14 @@ static int update_block_group(struct btrfs_root *root, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5424,7 +5359,6 @@ static int update_block_group(struct btrfs_root *root, cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; - cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { @@ -5807,10 +5741,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, unpin = &fs_info->freed_extents[0]; while (1) { + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); - if (ret) + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, @@ -5818,6 +5755,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, clear_extent_dirty(unpin, start, end, GFP_NOFS); unpin_extent_range(root, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } @@ -6103,7 +6041,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(root, bytenr, num_bytes, 0); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -6205,7 +6143,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_group_cache *cache = NULL; int pin = 1; int ret; @@ -6221,17 +6158,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group_cache *cache; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) goto out; } + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); goto out; } @@ -6239,6 +6179,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } @@ -6253,7 +6194,6 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - btrfs_put_block_group(cache); } /* Can return -ENOMEM */ @@ -7063,7 +7003,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7152,7 +7092,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->nodesize, 1); + ret = update_block_group(trans, root, ins->objectid, root->nodesize, + 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7217,11 +7158,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize, int level) + u64 bytenr, int level) { struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); @@ -7340,7 +7281,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - blocksize, level); + level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -7357,8 +7298,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, - blocksize, level); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { @@ -7487,7 +7427,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(root, bytenr, blocksize); + readahead_tree_block(root, bytenr); nread++; } wc->reada_slot = slot; @@ -7828,7 +7768,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root, bytenr); if (!next) { - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, @@ -8548,14 +8488,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, - CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - } - ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8566,6 +8498,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + check_system_chunk(trans, root, alloc_flags); + } + btrfs_end_transaction(trans, root); return ret; } @@ -9005,6 +8942,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9068,9 +9006,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - cache->disk_cache_state = BTRFS_DC_CLEAR; if (btrfs_test_opt(root, SPACE_CACHE)) - cache->dirty = 1; + cache->disk_cache_state = BTRFS_DC_CLEAR; } read_extent_buffer(leaf, &cache->item, @@ -9460,6 +9397,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); @@ -9611,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. */ - trans = btrfs_join_transaction(root); + /* 1 for btrfs_orphan_reserve_metadata() */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_set_block_group_rw(root, block_group); ret = PTR_ERR(trans); @@ -9624,18 +9569,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ start = block_group->key.objectid; end = start + block_group->key.offset - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_set_block_group_rw(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_set_block_group_rw(root, block_group); goto end_trans; } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ block_group->pinned = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c73df6a7c9b6..c7233ff1d533 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n", + pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), atomic_read(&state->refs)); @@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->set_bit_hook) tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits); + struct extent_state *state, unsigned *bits); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned long *bits) + unsigned *bits) { struct rb_node *node; @@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned long *bits, int wake) + unsigned *bits, int wake) { struct extent_state *next; - unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; + unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int wake, int delete, + unsigned bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask) { @@ -789,9 +789,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned long *bits) + unsigned *bits) { - unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; + unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { @@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree, static void cache_state_if_flags(struct extent_state *state, struct extent_state **cached_ptr, - const u64 flags) + unsigned flags) { if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { @@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state, static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long exclusive_bits, + unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask) { @@ -1034,7 +1034,7 @@ search_again: } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, u64 * failed_start, + unsigned bits, u64 * failed_start, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, @@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long clear_bits, + unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; @@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask) + unsigned bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask) + unsigned bits, gfp_t mask) { return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } @@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, struct extent_state **cached_state) + unsigned bits, struct extent_state **cached_state) { int err; u64 failed_start; + while (1) { err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, @@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) */ static struct extent_state * find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, unsigned long bits) + u64 start, unsigned bits) { struct rb_node *node; struct extent_state *state; @@ -1474,7 +1475,7 @@ out: * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned long bits, + u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1753,7 +1754,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned long clear_bits, + unsigned clear_bits, unsigned long page_ops) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits, int contig) + unsigned bits, int contig) { struct rb_node *node; struct extent_state *state; @@ -1928,7 +1929,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int filled, struct extent_state *cached) + unsigned bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; @@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, sector = bbio->stripes[mirror_num-1].physical >> 9; bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; - kfree(bbio); + btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { bio_put(bio); return -EIO; @@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); - if (ret < 0) + if (ret < 0) { + *bio_ret = NULL; return ret; + } bio = NULL; } else { return 0; @@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, page, &delalloc_start, &delalloc_end, - 128 * 1024 * 1024); + BTRFS_MAX_EXTENT_SIZE); if (nr_delalloc == 0) { delalloc_start = delalloc_end + 1; continue; @@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) static struct extent_buffer * __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len, gfp_t mask) + unsigned long len) { struct extent_buffer *eb = NULL; - eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS); if (eb == NULL) return NULL; eb->start = start; @@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) struct extent_buffer *new; unsigned long num_pages = num_extent_pages(src->start, src->len); - new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); + new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) return NULL; @@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) return new; } -struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - unsigned long num_pages = num_extent_pages(0, len); + unsigned long len; + unsigned long num_pages; unsigned long i; - eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + num_pages = num_extent_pages(0, len); + + eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; @@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) + u64 start) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(start, len); + eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) return NULL; eb->fs_info = fs_info; @@ -4808,8 +4824,9 @@ free_eb: #endif struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) + u64 start) { + unsigned long len = fs_info->tree_root->nodesize; unsigned long num_pages = num_extent_pages(start, len); unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT; @@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ece9ce87edff..695b0ccfb755 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -4,22 +4,22 @@ #include <linux/rbtree.h> /* bits for the extent state */ -#define EXTENT_DIRTY 1 -#define EXTENT_WRITEBACK (1 << 1) -#define EXTENT_UPTODATE (1 << 2) -#define EXTENT_LOCKED (1 << 3) -#define EXTENT_NEW (1 << 4) -#define EXTENT_DELALLOC (1 << 5) -#define EXTENT_DEFRAG (1 << 6) -#define EXTENT_BOUNDARY (1 << 9) -#define EXTENT_NODATASUM (1 << 10) -#define EXTENT_DO_ACCOUNTING (1 << 11) -#define EXTENT_FIRST_DELALLOC (1 << 12) -#define EXTENT_NEED_WAIT (1 << 13) -#define EXTENT_DAMAGED (1 << 14) -#define EXTENT_NORESERVE (1 << 15) -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) +#define EXTENT_DIRTY (1U << 0) +#define EXTENT_WRITEBACK (1U << 1) +#define EXTENT_UPTODATE (1U << 2) +#define EXTENT_LOCKED (1U << 3) +#define EXTENT_NEW (1U << 4) +#define EXTENT_DELALLOC (1U << 5) +#define EXTENT_DEFRAG (1U << 6) +#define EXTENT_BOUNDARY (1U << 9) +#define EXTENT_NODATASUM (1U << 10) +#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_FIRST_DELALLOC (1U << 12) +#define EXTENT_NEED_WAIT (1U << 13) +#define EXTENT_DAMAGED (1U << 14) +#define EXTENT_NORESERVE (1U << 15) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* * flags for bio submission. The high bits indicate the compression @@ -81,9 +81,9 @@ struct extent_io_ops { int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long *bits); + unsigned *bits); void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long *bits); + unsigned *bits); void (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -108,7 +108,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; atomic_t refs; - unsigned long state; + unsigned state; /* for use by the FS */ u64 private; @@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, int try_release_extent_buffer(struct page *page); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, struct extent_state **cached); + unsigned bits, struct extent_state **cached); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); @@ -202,21 +202,21 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits, int contig); + u64 max_bytes, unsigned bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int filled, + unsigned bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask); + unsigned bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int wake, int delete, + unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask); + unsigned bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, u64 *failed_start, + unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); @@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long clear_bits, + unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned long bits, + u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); @@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); -struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); + u64 start); +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned long bits_to_clear, + unsigned bits_to_clear, unsigned long page_ops); struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, @@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); + u64 start); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d6c03f7f136b..a71978578fa7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; - struct list_head bitmaps; + LIST_HEAD(bitmaps); u64 num_entries; u64 num_bitmaps; u64 generation; u8 type; int ret = 0; - INIT_LIST_HEAD(&bitmaps); - /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) return 0; @@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; + enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); if (ret) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_ERROR; - spin_unlock(&block_group->lock); + dcs = BTRFS_DC_ERROR; ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, @@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root, #endif } + spin_lock(&block_group->lock); + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); iput(inode); return ret; } @@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root, trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, min_bytes); - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 8ffa4783cbf4..265e03c73f4d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { @@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); ret = 0; } else if (ret < 0) { - if (ret == -EOVERFLOW) - ret = -EMLINK; + if (ret == -EOVERFLOW) { + if (find_name_in_backref(path, name, name_len, &ref)) + ret = -EEXIST; + else + ret = -EMLINK; + } goto out; } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54bcf639d1cf..a85c23dfcddb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, static void btrfs_split_extent_hook(struct inode *inode, struct extent_state *orig, u64 split) { + u64 size; + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; + size = orig->end - orig->start + 1; + if (size > BTRFS_MAX_EXTENT_SIZE) { + u64 num_extents; + u64 new_size; + + /* + * We need the largest size of the remaining extent to see if we + * need to add a new outstanding extent. Think of the following + * case + * + * [MEAX_EXTENT_SIZEx2 - 4k][4k] + * + * The new_size would just be 4k and we'd think we had enough + * outstanding extents for this if we only took one side of the + * split, same goes for the other direction. We need to see if + * the larger size still is the same amount of extents as the + * original size, because if it is we need to add a new + * outstanding extent. But if we split up and the larger size + * is less than the original then we are good to go since we've + * already accounted for the extra extent in our original + * accounting. + */ + new_size = orig->end - split + 1; + if ((split - orig->start) > new_size) + new_size = split - orig->start; + + num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) < num_extents) + return; + } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); @@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode, struct extent_state *new, struct extent_state *other) { + u64 new_size, old_size; + u64 num_extents; + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; + old_size = other->end - other->start + 1; + new_size = old_size + (new->end - new->start + 1); + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); + return; + } + + /* + * If we grew by another max_extent, just return, we want to keep that + * reserved amount. + */ + num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) > num_extents) + return; + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents--; spin_unlock(&BTRFS_I(inode)->lock); @@ -1604,7 +1663,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * have pending delalloc work to be done. */ static void btrfs_set_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1645,9 +1704,11 @@ static void btrfs_set_bit_hook(struct inode *inode, */ static void btrfs_clear_bit_hook(struct inode *inode, struct extent_state *state, - unsigned long *bits) + unsigned *bits) { u64 len = state->end + 1 - state->start; + u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1, + BTRFS_MAX_EXTENT_SIZE); spin_lock(&BTRFS_I(inode)->lock); if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) @@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, *bits &= ~EXTENT_FIRST_DELALLOC; } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + BTRFS_I(inode)->outstanding_extents -= num_extents; spin_unlock(&BTRFS_I(inode)->lock); } @@ -2945,7 +3006,7 @@ static int __readpage_endio_check(struct inode *inode, return 0; zeroit: if (__ratelimit(&_rs)) - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_warn(BTRFS_I(inode)->root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, csum_expected); memset(kaddr + pgoff, 1, len); @@ -3407,7 +3468,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) - btrfs_crit(root->fs_info, + btrfs_err(root->fs_info, "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; @@ -3490,7 +3551,6 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; unsigned long ptr; @@ -3527,17 +3587,19 @@ static void btrfs_read_locked_inode(struct inode *inode) i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); @@ -3656,21 +3718,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, &item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); + btrfs_set_token_timespec_sec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_nsec, &token); + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), &token); btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, @@ -5007,6 +5074,7 @@ static int fixup_tree_root_location(struct btrfs_root *root, struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; + struct btrfs_key key; int ret; int err = 0; @@ -5017,9 +5085,12 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_item(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid, BTRFS_ROOT_REF_KEY, NULL); + key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path, + 0, 0); if (ret) { if (ret < 0) err = ret; @@ -5258,7 +5329,10 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; return inode; } @@ -5826,7 +5900,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, @@ -7134,11 +7213,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; + u64 orig_len = len; int unlock_bits = EXTENT_LOCKED; int ret = 0; if (create) - unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; + unlock_bits |= EXTENT_DIRTY; else len = min_t(u64, len, root->sectorsize); @@ -7269,14 +7349,12 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - BUG_ON(ret); + if (len < orig_len) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } + btrfs_free_reserved_data_space(inode, len); } /* @@ -7805,8 +7883,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, } /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_get_alloc_profile(root, 1) & - (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; @@ -8053,8 +8130,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); - else - btrfs_delalloc_release_metadata(inode, 0); } out: if (wakeup) @@ -8575,6 +8650,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delayed_node = NULL; + ei->i_otime.tv_sec = 0; + ei->i_otime.tv_nsec = 0; + inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(&ei->io_tree, &inode->i_data); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 48b60dbf807f..97159a8e91d4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup = u64_to_ptr(unode->aux); qgroup->rfer += sign * oper->num_bytes; qgroup->rfer_cmpr += sign * oper->num_bytes; + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); qgroup->excl += sign * oper->num_bytes; - if (sign < 0) - WARN_ON(qgroup->excl < oper->num_bytes); qgroup->excl_cmpr += sign * oper->num_bytes; qgroup_dirty(fs_info, qgroup); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8ab2a17bbba8..5264858ed768 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -58,15 +58,6 @@ */ #define RBIO_CACHE_READY_BIT 3 -/* - * bbio and raid_map is managed by the caller, so we shouldn't free - * them here. And besides that, all rbios with this flag should not - * be cached, because we need raid_map to check the rbios' stripe - * is the same or not, but it is very likely that the caller has - * free raid_map, so don't cache those rbios. - */ -#define RBIO_HOLD_BBIO_MAP_BIT 4 - #define RBIO_CACHE_SIZE 1024 enum btrfs_rbio_ops { @@ -79,13 +70,6 @@ struct btrfs_raid_bio { struct btrfs_fs_info *fs_info; struct btrfs_bio *bbio; - /* - * logical block numbers for the start of each stripe - * The last one or two are p/q. These are sorted, - * so raid_map[0] is the start of our full stripe - */ - u64 *raid_map; - /* while we're doing rmw on a stripe * we put it into a hash table so we can * lock the stripe and merge more rbios @@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->raid_map[0]; + u64 num = rbio->bbio->raid_map[0]; /* * we shift down quite a bit. We're using byte @@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->raid_map[0] != - cur->raid_map[0]) + if (last->bbio->raid_map[0] != + cur->bbio->raid_map[0]) return 0; /* we can't merge with different operations */ @@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { walk++; - if (cur->raid_map[0] == rbio->raid_map[0]) { + if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { spin_lock(&cur->bio_list_lock); /* can we steal this cached rbio's pages? */ @@ -841,21 +825,6 @@ done_nolock: remove_rbio_from_cache(rbio); } -static inline void -__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need) -{ - if (need) { - kfree(raid_map); - kfree(bbio); - } -} - -static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio) -{ - __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, - !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)); -} - static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; @@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) } } - free_bbio_and_raid_map(rbio); - + btrfs_put_bbio(rbio->bbio); kfree(rbio); } @@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len) + struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; int nr_data = 0; @@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bbio = bbio; - rbio->raid_map = raid_map; rbio->fs_info = root->fs_info; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; @@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, rbio->bio_pages = p + sizeof(struct page *) * num_pages; rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; - if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + nr_data = real_stripes - 1; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) nr_data = real_stripes - 2; else - nr_data = real_stripes - 1; + BUG(); rbio->nr_data = nr_data; return rbio; @@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { start = (u64)bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->raid_map[0]; + stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_CACHE_SHIFT; for (i = 0; i < bio->bi_vcnt; i++) { @@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, logical <<= 9; for (i = 0; i < rbio->nr_data; i++) { - stripe_start = rbio->raid_map[i]; + stripe_start = rbio->bbio->raid_map[i]; if (logical >= stripe_start && logical < stripe_start + rbio->stripe_len) { return i; @@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) * our main entry point for writes from the rest of the FS. */ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len) + struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) { - __free_bbio_and_raid_map(bbio, raid_map, 1); + btrfs_put_bbio(bbio); return PTR_ERR(rbio); } bio_list_add(&rbio->bio_list, bio); @@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->raid_map[rbio->real_stripes - 1] == - RAID6_Q_STRIPE) { - + if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* * single failure, rebuild from parity raid5 * style @@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * here due to a crc mismatch and we can't give them the * data they want */ - if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->raid_map[faila] == RAID5_P_STRIPE) { + if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bbio->raid_map[faila] == + RAID5_P_STRIPE) { err = -EIO; goto cleanup; } @@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto pstripe; } - if (rbio->raid_map[failb] == RAID5_P_STRIPE) { + if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, PAGE_SIZE, faila, pointers); } else { @@ -2001,8 +1967,7 @@ cleanup: cleanup_io: if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - if (err == 0 && - !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)) + if (err == 0) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -2156,15 +2121,16 @@ cleanup: * of the drive. */ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num, int generic_io) + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io) { struct btrfs_raid_bio *rbio; int ret; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) { - __free_bbio_and_raid_map(bbio, raid_map, generic_io); + if (generic_io) + btrfs_put_bbio(bbio); return PTR_ERR(rbio); } @@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); - __free_bbio_and_raid_map(bbio, raid_map, generic_io); + if (generic_io) + btrfs_put_bbio(bbio); kfree(rbio); return -EIO; } @@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, btrfs_bio_counter_inc_noblocked(root->fs_info); rbio->generic_bio_cnt = 1; } else { - set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); + btrfs_get_bbio(bbio); } /* @@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work) struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, int stripe_offset; int index; - ASSERT(logical >= rbio->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + + ASSERT(logical >= rbio->bbio->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + rbio->stripe_len * rbio->nr_data); - stripe_offset = (int)(logical - rbio->raid_map[0]); + stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); index = stripe_offset >> PAGE_CACHE_SHIFT; rbio->bio_pages[index] = page; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 31d4a157b5e3..2b5d7977d83b 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -43,16 +43,15 @@ struct btrfs_raid_bio; struct btrfs_device; int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num, int generic_io); + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io); int raid56_parity_write(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len); + struct btrfs_bio *bbio, u64 stripe_len); struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, u64 logical); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b63ae20618fb..0e7beea92b4c 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - u32 blocksize; int err; struct list_head extctl; int refcnt; @@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, blocksize = root->nodesize; re->logical = logical; - re->blocksize = blocksize; re->top = *top; INIT_LIST_HEAD(&re->extctl); spin_lock_init(&re->lock); @@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace); - kfree(bbio); + btrfs_put_bbio(bbio); return re; error: @@ -488,7 +486,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - kfree(bbio); + btrfs_put_bbio(bbio); kfree(re); return re_exist; } @@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, int mirror_num = 0; struct extent_buffer *eb = NULL; u64 logical; - u32 blocksize; int ret; int i; int need_kick = 0; @@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); return 0; } - dev->reada_next = re->logical + re->blocksize; + dev->reada_next = re->logical + fs_info->tree_root->nodesize; re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } } logical = re->logical; - blocksize = re->blocksize; spin_lock(&re->lock); if (re->scheduled_for == NULL) { @@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, return 0; atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, - mirror_num, &eb); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, + mirror_num, &eb); if (ret) __readahead_hook(fs_info->extent_root, NULL, logical, ret); else if (eb) @@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) break; printk(KERN_DEBUG " re: logical %llu size %u empty %d for %lld", - re->logical, re->blocksize, + re->logical, fs_info->tree_root->nodesize, list_empty(&re->extctl), re->scheduled_for ? re->scheduled_for->devid : -1); @@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } printk(KERN_DEBUG "re: logical %llu size %u list empty %d for %lld", - re->logical, re->blocksize, list_empty(&re->extctl), + re->logical, fs_info->tree_root->nodesize, + list_empty(&re->extctl), re->scheduled_for ? re->scheduled_for->devid : -1); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 74257d6436ad..d83085381bcc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc, } } -static int tree_block_processed(u64 bytenr, u32 blocksize, - struct reloc_control *rc) +static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { + u32 blocksize = rc->extent_root->nodesize; + if (test_range_bit(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) return 1; @@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); if (!block->key_ready) - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid); + readahead_tree_block(rc->extent_root, block->bytenr); rb_node = rb_next(rb_node); } @@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc, bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, SKINNY_METADATA); - if (tree_block_processed(bytenr, blocksize, rc)) + if (tree_block_processed(bytenr, rc)) return 0; if (tree_search(blocks, bytenr)) @@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc, if (added) goto next; - if (!tree_block_processed(leaf->start, leaf->len, rc)) { + if (!tree_block_processed(leaf->start, rc)) { block = kmalloc(sizeof(*block), GFP_NOFS); if (!block) { err = -ENOMEM; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e427cb7ee12c..ec57687c9a4d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -66,7 +66,6 @@ struct scrub_ctx; struct scrub_recover { atomic_t refs; struct btrfs_bio *bbio; - u64 *raid_map; u64 map_length; }; @@ -80,7 +79,7 @@ struct scrub_page { u64 logical; u64 physical; u64 physical_for_dev_replace; - atomic_t ref_count; + atomic_t refs; struct { unsigned int mirror_num:8; unsigned int have_csum:1; @@ -113,7 +112,7 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t ref_count; /* free mem on transition to zero */ + atomic_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; struct { @@ -142,7 +141,7 @@ struct scrub_parity { int stripe_len; - atomic_t ref_count; + atomic_t refs; struct list_head spages; @@ -194,6 +193,15 @@ struct scrub_ctx { */ struct btrfs_scrub_progress stat; spinlock_t stat_lock; + + /* + * Use a ref counter to avoid use-after-free issues. Scrub workers + * decrement bios_in_flight and workers_pending and then do a wakeup + * on the list_wait wait queue. We must ensure the main scrub task + * doesn't free the scrub context before or while the workers are + * doing the wakeup() call. + */ + atomic_t refs; }; struct scrub_fixup_nodatasum { @@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_fs_info *fs_info, - struct scrub_block *original_sblock, - u64 length, u64 logical, +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, @@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, const u8 *csum, u64 generation, u16 csum_size); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write); + struct scrub_block *sblock_good); static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write); @@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static void copy_nocow_pages_worker(struct btrfs_work *work); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { + atomic_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } @@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) { atomic_dec(&sctx->bios_in_flight); wake_up(&sctx->list_wait); + scrub_put_ctx(sctx); } static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) @@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + atomic_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. we must also @@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) atomic_dec(&sctx->workers_pending); wake_up(&fs_info->scrub_pause_wait); wake_up(&sctx->list_wait); + scrub_put_ctx(sctx); } static void scrub_free_csums(struct scrub_ctx *sctx) @@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sctx); } +static void scrub_put_ctx(struct scrub_ctx *sctx) +{ + if (atomic_dec_and_test(&sctx->refs)) + scrub_free_ctx(sctx); +} + static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { @@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; + atomic_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = pages_per_rd_bio; sctx->curr = -1; @@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key root_key; + struct btrfs_key key; root_key.objectid = root; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, goto err; } - ret = inode_item_info(inum, 0, local_root, swarn->path); + /* + * this makes the path point to (inum INODE_ITEM ioff) + */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { btrfs_release_path(swarn->path); goto err; @@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover) static inline void scrub_put_recover(struct scrub_recover *recover) { if (atomic_dec_and_test(&recover->refs)) { - kfree(recover->bbio); - kfree(recover->raid_map); + btrfs_put_bbio(recover->bbio); kfree(recover); } } @@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, - logical, sblocks_for_recheck); + ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; @@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; -nodatasum_case: WARN_ON(sctx->is_dev_replace); +nodatasum_case: + /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified @@ -1091,76 +1114,20 @@ nodatasum_case: sblock_other->no_io_error_seen) { if (sctx->is_dev_replace) { scrub_write_block_to_dev_replace(sblock_other); + goto corrected_error; } else { - int force_write = is_metadata || have_csum; - ret = scrub_repair_block_from_good_copy( - sblock_bad, sblock_other, - force_write); + sblock_bad, sblock_other); + if (!ret) + goto corrected_error; } - if (0 == ret) - goto corrected_error; } } - /* - * for dev_replace, pick good pages and write to the target device. - */ - if (sctx->is_dev_replace) { - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; - page_num++) { - int sub_success; - - sub_success = 0; - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = - sblocks_for_recheck + mirror_index; - struct scrub_page *page_other = - sblock_other->pagev[page_num]; - - if (!page_other->io_error) { - ret = scrub_write_page_to_dev_replace( - sblock_other, page_num); - if (ret == 0) { - /* succeeded for this page */ - sub_success = 1; - break; - } else { - btrfs_dev_replace_stats_inc( - &sctx->dev_root-> - fs_info->dev_replace. - num_write_errors); - } - } - } - - if (!sub_success) { - /* - * did not find a mirror to fetch the page - * from. scrub_write_page_to_dev_replace() - * handles this case (page->io_error), by - * filling the block with zeros before - * submitting the write request - */ - success = 0; - ret = scrub_write_page_to_dev_replace( - sblock_bad, page_num); - if (ret) - btrfs_dev_replace_stats_inc( - &sctx->dev_root->fs_info-> - dev_replace.num_write_errors); - } - } - - goto out; - } + if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) + goto did_not_correct_error; /* - * for regular scrub, repair those pages that are errored. * In case of I/O errors in the area that is supposed to be * repaired, continue by picking good copies of those pages. * Select the good pages from mirrors to rewrite bad pages from @@ -1184,44 +1151,64 @@ nodatasum_case: * mirror, even if other 512 byte sectors in the same PAGE_SIZE * area are unreadable. */ - - /* can only fix I/O errors from here on */ - if (sblock_bad->no_io_error_seen) - goto did_not_correct_error; - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_block *sblock_other = NULL; - if (!page_bad->io_error) + /* skip no-io-error page in scrub */ + if (!page_bad->io_error && !sctx->is_dev_replace) continue; - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; - struct scrub_page *page_other = sblock_other->pagev[ - page_num]; - - if (!page_other->io_error) { - ret = scrub_repair_page_from_good_copy( - sblock_bad, sblock_other, page_num, 0); - if (0 == ret) { - page_bad->io_error = 0; - break; /* succeeded for this page */ + /* try to find no-io-error page in mirrors */ + if (page_bad->io_error) { + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (!sblocks_for_recheck[mirror_index]. + pagev[page_num]->io_error) { + sblock_other = sblocks_for_recheck + + mirror_index; + break; } } + if (!sblock_other) + success = 0; } - if (page_bad->io_error) { - /* did not find a mirror to copy the page from */ - success = 0; + if (sctx->is_dev_replace) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + if (!sblock_other) + sblock_other = sblock_bad; + + if (scrub_write_page_to_dev_replace(sblock_other, + page_num) != 0) { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + success = 0; + } + } else if (sblock_other) { + ret = scrub_repair_page_from_good_copy(sblock_bad, + sblock_other, + page_num, 0); + if (0 == ret) + page_bad->io_error = 0; + else + success = 0; } } - if (success) { + if (success && !sctx->is_dev_replace) { if (is_metadata || have_csum) { /* * need to verify the checksum now that all @@ -1288,19 +1275,18 @@ out: return 0; } -static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map) +static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) { - if (raid_map) { - if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) - return 3; - else - return 2; - } else { + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + return 3; + else return (int)bbio->num_stripes; - } } -static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, +static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, + u64 *raid_map, u64 mapped_length, int nstripes, int mirror, int *stripe_index, @@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, { int i; - if (raid_map) { + if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { /* RAID5/6 */ for (i = 0; i < nstripes; i++) { if (raid_map[i] == RAID6_Q_STRIPE || @@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, } } -static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_fs_info *fs_info, - struct scrub_block *original_sblock, - u64 length, u64 logical, +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck) { + struct scrub_ctx *sctx = original_sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + u64 length = original_sblock->page_count * PAGE_SIZE; + u64 logical = original_sblock->pagev[0]->logical; struct scrub_recover *recover; struct btrfs_bio *bbio; - u64 *raid_map; u64 sublen; u64 mapped_length; u64 stripe_offset; int stripe_index; - int page_index; + int page_index = 0; int mirror_index; int nmirrors; int ret; /* - * note: the two members ref_count and outstanding_pages + * note: the two members refs and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ - page_index = 0; while (length > 0) { sublen = min_t(u64, length, PAGE_SIZE); mapped_length = sublen; bbio = NULL; - raid_map = NULL; /* * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, - &mapped_length, &bbio, 0, &raid_map); + &mapped_length, &bbio, 0, 1); if (ret || !bbio || mapped_length < sublen) { - kfree(bbio); - kfree(raid_map); + btrfs_put_bbio(bbio); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { - kfree(bbio); - kfree(raid_map); + btrfs_put_bbio(bbio); return -ENOMEM; } atomic_set(&recover->refs, 1); recover->bbio = bbio; - recover->raid_map = raid_map; recover->map_length = mapped_length; BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); - nmirrors = scrub_nr_raid_mirrors(bbio, raid_map); + nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); + for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; struct scrub_page *page; - if (mirror_index >= BTRFS_MAX_MIRRORS) - continue; - sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; page = kzalloc(sizeof(*page), GFP_NOFS); @@ -1410,9 +1389,12 @@ leave_nomem: sblock->pagev[page_index] = page; page->logical = logical; - scrub_stripe_index_and_offset(logical, raid_map, + scrub_stripe_index_and_offset(logical, + bbio->map_type, + bbio->raid_map, mapped_length, - bbio->num_stripes, + bbio->num_stripes - + bbio->num_tgtdevs, mirror_index, &stripe_index, &stripe_offset); @@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error) static inline int scrub_is_page_on_raid56(struct scrub_page *page) { - return page->recover && page->recover->raid_map; + return page->recover && + (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, @@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, bio->bi_end_io = scrub_bio_wait_endio; ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, - page->recover->raid_map, page->recover->map_length, page->mirror_num, 0); if (ret) @@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, } static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write) + struct scrub_block *sblock_good) { int page_num; int ret = 0; @@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, ret_sub = scrub_repair_page_from_good_copy(sblock_bad, sblock_good, - page_num, - force_write); + page_num, 1); if (ret_sub) ret = ret_sub; } @@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->ref_count); + atomic_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->ref_count)) { + if (atomic_dec_and_test(&sblock->refs)) { int i; if (sblock->sparity) @@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock) static void scrub_page_get(struct scrub_page *spage) { - atomic_inc(&spage->ref_count); + atomic_inc(&spage->refs); } static void scrub_page_put(struct scrub_page *spage) { - if (atomic_dec_and_test(&spage->ref_count)) { + if (atomic_dec_and_test(&spage->refs)) { if (spage->page) __free_page(spage->page); kfree(spage); @@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->ref_count, 1); + atomic_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->ref_count, 1); + atomic_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; sblock->sparity = sparity; @@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_raid_bio *rbio; struct scrub_page *spage; struct btrfs_bio *bbio = NULL; - u64 *raid_map = NULL; u64 length; int ret; @@ -2716,8 +2695,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) length = sparity->logic_end - sparity->logic_start + 1; ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, sparity->logic_start, - &length, &bbio, 0, &raid_map); - if (ret || !bbio || !raid_map) + &length, &bbio, 0, 1); + if (ret || !bbio || !bbio->raid_map) goto bbio_out; bio = btrfs_io_bio_alloc(GFP_NOFS, 0); @@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bio->bi_end_io = scrub_parity_bio_endio; rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, - raid_map, length, - sparity->scrub_dev, + length, sparity->scrub_dev, sparity->dbitmap, sparity->nsectors); if (!rbio) @@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); bbio_out: - kfree(bbio); - kfree(raid_map); + btrfs_put_bbio(bbio); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); @@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors) static void scrub_parity_get(struct scrub_parity *sparity) { - atomic_inc(&sparity->ref_count); + atomic_inc(&sparity->refs); } static void scrub_parity_put(struct scrub_parity *sparity) { - if (!atomic_dec_and_test(&sparity->ref_count)) + if (!atomic_dec_and_test(&sparity->refs)) return; scrub_parity_check_and_repair(sparity); @@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->scrub_dev = sdev; sparity->logic_start = logic_start; sparity->logic_end = logic_end; - atomic_set(&sparity->ref_count, 1); + atomic_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->spages); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; @@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); mirror_num = 1; @@ -3074,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ logical = base + offset; physical_end = physical + nstripes * map->stripe_len; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical_end, num, map, &logic_end, NULL); logic_end += base; @@ -3121,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ret = 0; while (physical < physical_end) { /* for raid56, we skip parity stripe */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = get_raid56_logic_offset(physical, num, map, &logical, &stripe_logical); logical += base; @@ -3280,8 +3254,7 @@ again: scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { /* * loop until we find next data stripe * or we have finished all stripes. @@ -3775,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); - scrub_free_ctx(sctx); + scrub_put_ctx(sctx); return ret; } @@ -3881,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < extent_len || !bbio->stripes[0].dev->bdev) { - kfree(bbio); + btrfs_put_bbio(bbio); return; } *extent_physical = bbio->stripes[0].physical; *extent_mirror_num = bbio->mirror_num; *extent_dev = bbio->stripes[0].dev; - kfree(bbio); + btrfs_put_bbio(bbio); } static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 804432dbc351..fe5857223515 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2471,12 +2471,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, - btrfs_inode_atime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, - btrfs_inode_mtime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, - btrfs_inode_ctime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); /* TODO Add otime support when the otime patches get into upstream */ ret = send_cmd(sctx); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6f49b2872a64..05fef198ff94 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1958,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans, root); } -static int btrfs_unfreeze(struct super_block *sb) -{ - return 0; -} - static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -2011,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, - .unfreeze_fs = btrfs_unfreeze, }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 92db3f648df4..94edb0a2a026 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -733,10 +733,18 @@ int btrfs_init_sysfs(void) ret = btrfs_init_debugfs(); if (ret) - return ret; + goto out1; init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) + goto out2; + + return 0; +out2: + debugfs_remove_recursive(btrfs_debugfs_root_dentry); +out1: + kset_unregister(btrfs_kset); return ret; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index cc286ce97d1e..f51963a8f929 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -53,7 +53,7 @@ static int test_btrfs_split_item(void) return -ENOMEM; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 7e99c2f98dd0..9e9f2368177d 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -258,8 +258,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, - (unsigned long)-1, GFP_NOFS); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS); out: if (locked_page) page_cache_release(locked_page); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 3ae0f5b8bb80..a116b55ce788 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void) goto out; } - root->node = alloc_dummy_extent_buffer(0, 4096); + root->node = alloc_dummy_extent_buffer(NULL, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -843,7 +843,7 @@ static int test_hole_first(void) goto out; } - root->node = alloc_dummy_extent_buffer(0, 4096); + root->node = alloc_dummy_extent_buffer(NULL, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ec3dcb202357..73f299ebdabb 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -404,12 +404,22 @@ int btrfs_test_qgroups(void) ret = -ENOMEM; goto out; } + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to add the root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; /* * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -448,17 +458,6 @@ int btrfs_test_qgroups(void) goto out; } - /* We are using this root as our extent root */ - root->fs_info->extent_root = root; - - /* - * Some of the paths we test assume we have a filled out fs_info, so we - * just need to addt he root in there so we don't panic. - */ - root->fs_info->tree_root = root; - root->fs_info->quota_root = root; - root->fs_info->quota_enabled = 1; - test_msg("Running qgroup tests\n"); ret = test_no_shared_qgroup(root); if (ret) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e88b59d13439..7e80f32550a6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -220,6 +220,7 @@ loop: * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); + cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.href_root = RB_ROOT; @@ -248,6 +249,8 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->pending_ordered); + INIT_LIST_HEAD(&cur_trans->dirty_bgs); + spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -1020,6 +1023,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; + bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID); old_root_used = btrfs_root_used(&root->root_item); btrfs_write_dirty_block_groups(trans, root); @@ -1027,7 +1031,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start && - old_root_used == btrfs_root_used(&root->root_item)) + old_root_used == btrfs_root_used(&root->root_item) && + (!extent_root || + list_empty(&trans->transaction->dirty_bgs))) break; btrfs_set_root_node(&root->root_item, root->node); @@ -1038,7 +1044,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; old_root_used = btrfs_root_used(&root->root_item); - ret = btrfs_write_dirty_block_groups(trans, root); + if (extent_root) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + } + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; } @@ -1061,10 +1075,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); @@ -1097,6 +1107,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + clear_bit(BTRFS_ROOT_DIRTY, &root->state); if (root != fs_info->extent_root) list_add_tail(&root->dirty_list, @@ -1983,6 +1994,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); + ASSERT(list_empty(&cur_trans->dirty_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); @@ -2026,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); + if (cur_trans->have_free_bgs) + btrfs_clear_space_info_full(root->fs_info); + root->fs_info->last_trans_committed = cur_trans->transid; /* * We needn't acquire the lock here because there is no other task diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 00ed29c4b3f9..937050a2b68e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,6 +47,11 @@ struct btrfs_transaction { atomic_t num_writers; atomic_t use_count; + /* + * true if there is free bgs operations in this transaction + */ + int have_free_bgs; + /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; struct list_head list; @@ -58,6 +63,8 @@ struct btrfs_transaction { struct list_head pending_chunks; struct list_head pending_ordered; struct list_head switch_commits; + struct list_head dirty_bgs; + spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1a9585d4380a..9a37f8b39bae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, insert: btrfs_release_path(path); /* try to insert the key into the destination tree */ + path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, key, item_size); + path->skip_release_on_error = 0; /* make sure any existing item is the correct size */ - if (ret == -EEXIST) { + if (ret == -EEXIST || ret == -EOVERFLOW) { u32 found_size; found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -488,8 +490,20 @@ insert: src_item = (struct btrfs_inode_item *)src_ptr; dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(eb, src_item) == 0) + if (btrfs_inode_generation(eb, src_item) == 0) { + struct extent_buffer *dst_eb = path->nodes[0]; + + if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + struct btrfs_map_token token; + u64 ino_size = btrfs_inode_size(eb, src_item); + + btrfs_init_map_token(&token); + btrfs_set_token_inode_size(dst_eb, dst_item, + ino_size, &token); + } goto no_copy; + } if (overwrite_root && S_ISDIR(btrfs_inode_mode(eb, src_item)) && @@ -844,7 +858,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - char *name, int namelen) + const char *name, int namelen) { struct btrfs_path *path; struct btrfs_inode_ref *ref; @@ -1254,13 +1268,14 @@ out: } static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) + struct btrfs_root *root, u64 ino) { int ret; - ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, - offset, BTRFS_ORPHAN_ITEM_KEY, NULL); - if (ret > 0) - ret = btrfs_insert_orphan_item(trans, root, offset); + + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; + return ret; } @@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root, leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *) (ptr + cur_offset); @@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root, } btrfs_release_path(path); - if (ret < 0) + if (ret < 0 && ret != -ENOENT) return ret; return nlink; } @@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, nlink = ret; ret = count_inode_extrefs(root, inode, path); - if (ret == -ENOENT) - ret = 0; - if (ret < 0) goto out; @@ -1557,6 +1570,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } /* + * Return true if an inode reference exists in the log for the given name, + * inode and parent inode. + */ +static bool name_in_log_ref(struct btrfs_root *log_root, + const char *name, const int name_len, + const u64 dirid, const u64 ino) +{ + struct btrfs_key search_key; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = dirid; + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(dirid, name, name_len); + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + return false; +} + +/* * take a single entry in a log directory item and replay it into * the subvolume. * @@ -1666,10 +1703,17 @@ out: return ret; insert: + if (name_in_log_ref(root->log_root, name, name_len, + key->objectid, log_key.objectid)) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } btrfs_release_path(path); ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) + if (ret && ret != -ENOENT && ret != -EEXIST) goto out; update_size = false; ret = 0; @@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; @@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); if (atomic_read(&root->log_writers)) schedule(); - mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); + mutex_lock(&root->log_mutex); } } @@ -3219,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, int log_inode_only) + struct inode *inode, int log_inode_only, + u64 logged_isize) { struct btrfs_map_token token; @@ -3232,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * to say 'update this inode with these values' */ btrfs_set_token_inode_generation(leaf, item, 0, &token); - btrfs_set_token_inode_size(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, logged_isize, &token); } else { btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, @@ -3245,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, &item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), @@ -3284,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); btrfs_release_path(path); return 0; } @@ -3293,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, struct btrfs_path *src_path, u64 *last_extent, - int start_slot, int nr, int inode_only) + int start_slot, int nr, int inode_only, + u64 logged_isize) { unsigned long src_offset; unsigned long dst_offset; @@ -3350,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, dst_path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, - inode, inode_only == LOG_INODE_EXISTS); + inode, inode_only == LOG_INODE_EXISTS, + logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, src_offset, ins_sizes[i]); @@ -3902,6 +3949,33 @@ process: return ret; } +static int logged_inode_size(struct btrfs_root *log, struct inode *inode, + struct btrfs_path *path, u64 *size_ret) +{ + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *size_ret = i_size_read(inode); + } else { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + } + + btrfs_release_path(path); + return 0; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -3939,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 logged_isize = 0; path = btrfs_alloc_path(); if (!path) @@ -3966,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* Only run delayed items if we are a dir or a new file */ + /* + * Only run delayed items if we are a dir or a new file. + * Otherwise commit the delayed inode only, which is needed in + * order for the log replay code to mark inodes for link count + * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + */ if (S_ISDIR(inode->i_mode) || - BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) ret = btrfs_commit_inode_delayed_items(trans, inode); - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; - } + else + ret = btrfs_commit_inode_delayed_inode(inode); + + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; } mutex_lock(&BTRFS_I(inode)->log_mutex); @@ -3988,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - if (inode_only == LOG_INODE_EXISTS) - max_key_type = BTRFS_XATTR_ITEM_KEY; + if (inode_only == LOG_INODE_EXISTS) { + max_key_type = BTRFS_INODE_EXTREF_KEY; + max_key.type = max_key_type; + } ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { - if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); - ret = btrfs_truncate_inode_items(trans, log, - inode, 0, 0); - } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + if (inode_only == LOG_INODE_EXISTS) { + /* + * Make sure the new inode item we write to the log has + * the same isize as the current one (if it exists). + * This is necessary to prevent data loss after log + * replay, and also to prevent doing a wrong expanding + * truncate - for e.g. create file, write 4K into offset + * 0, fsync, write 4K into offset 4096, add hard link, + * fsync some other file (to sync log), power fail - if + * we use the inode's current i_size, after log replay + * we get a 8Kb file, with the last 4Kb extent as a hole + * (zeroes), as if an expanding truncate happened, + * instead of getting a file of 4Kb only. + */ + err = logged_inode_size(log, inode, path, + &logged_isize); + if (err) + goto out_unlock; + } + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + if (inode_only == LOG_INODE_EXISTS) { + max_key.type = BTRFS_INODE_EXTREF_KEY; + ret = drop_objectid_items(trans, log, path, ino, + max_key.type); + } else { + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + ret = btrfs_truncate_inode_items(trans, log, + inode, 0, 0); + } + } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || inode_only == LOG_INODE_EXISTS) { - if (inode_only == LOG_INODE_ALL) + if (inode_only == LOG_INODE_ALL) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; + } else { + max_key.type = BTRFS_INODE_EXTREF_KEY; + } ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4047,7 +4163,8 @@ again: } ret = copy_items(trans, inode, dst_path, path, &last_extent, - ins_start_slot, ins_nr, inode_only); + ins_start_slot, ins_nr, inode_only, + logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4071,7 +4188,7 @@ next_slot: if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, &last_extent, ins_start_slot, - ins_nr, inode_only); + ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4092,7 +4209,8 @@ next_slot: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, &last_extent, - ins_start_slot, ins_nr, inode_only); + ins_start_slot, ins_nr, inode_only, + logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4273,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; + const struct dentry * const first_parent = parent; + const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > + last_committed); sb = inode->i_sb; @@ -4328,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - inode_only = LOG_INODE_EXISTS; while (1) { if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; @@ -4337,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (root != BTRFS_I(inode)->root) break; + /* + * On unlink we must make sure our immediate parent directory + * inode is fully logged. This is to prevent leaving dangling + * directory index entries and a wrong directory inode's i_size. + * Not doing so can result in a directory being impossible to + * delete after log replay (rmdir will always fail with error + * -ENOTEMPTY). + */ + if (did_unlink && parent == first_parent) + inode_only = LOG_INODE_ALL; + else + inode_only = LOG_INODE_EXISTS; + if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed) { + root->fs_info->last_trans_committed || + inode_only == LOG_INODE_ALL) { ret = btrfs_log_inode(trans, root, inode, inode_only, 0, LLONG_MAX, ctx); if (ret) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 50c5a8762aed..cd4d1315aaa9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1310,6 +1310,8 @@ again: if (ret) { btrfs_error(root->fs_info, ret, "Failed to remove dev extent item"); + } else { + trans->transaction->have_free_bgs = 1; } out: btrfs_free_path(path); @@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { - if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) + if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) return; btrfs_set_fs_incompat(info, RAID56); @@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); - } free_extent_map(em); return len; } @@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); return ret; @@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b) } /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) { struct btrfs_bio_stripe s; - int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; int i; u64 l; int again = 1; - int m; while (again) { again = 0; - for (i = 0; i < real_stripes - 1; i++) { - if (parity_smaller(raid_map[i], raid_map[i+1])) { + for (i = 0; i < num_stripes - 1; i++) { + if (parity_smaller(bbio->raid_map[i], + bbio->raid_map[i+1])) { s = bbio->stripes[i]; - l = raid_map[i]; + l = bbio->raid_map[i]; bbio->stripes[i] = bbio->stripes[i+1]; - raid_map[i] = raid_map[i+1]; + bbio->raid_map[i] = bbio->raid_map[i+1]; bbio->stripes[i+1] = s; - raid_map[i+1] = l; - - if (bbio->tgtdev_map) { - m = bbio->tgtdev_map[i]; - bbio->tgtdev_map[i] = - bbio->tgtdev_map[i + 1]; - bbio->tgtdev_map[i + 1] = m; - } + bbio->raid_map[i+1] = l; again = 1; } @@ -4909,10 +4900,41 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) } } +static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) +{ + struct btrfs_bio *bbio = kzalloc( + sizeof(struct btrfs_bio) + + sizeof(struct btrfs_bio_stripe) * (total_stripes) + + sizeof(int) * (real_stripes) + + sizeof(u64) * (real_stripes), + GFP_NOFS); + if (!bbio) + return NULL; + + atomic_set(&bbio->error, 0); + atomic_set(&bbio->refs, 1); + + return bbio; +} + +void btrfs_get_bbio(struct btrfs_bio *bbio) +{ + WARN_ON(!atomic_read(&bbio->refs)); + atomic_inc(&bbio->refs); +} + +void btrfs_put_bbio(struct btrfs_bio *bbio) +{ + if (!bbio) + return; + if (atomic_dec_and_test(&bbio->refs)) + kfree(bbio); +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, - int mirror_num, u64 **raid_map_ret) + int mirror_num, int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -4925,7 +4947,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr_orig; u64 stripe_nr_end; u64 stripe_len; - u64 *raid_map = NULL; int stripe_index; int i; int ret = 0; @@ -4976,7 +4997,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_offset = offset - stripe_offset; /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); raid56_full_stripe_start = offset; @@ -4989,8 +5010,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & REQ_DISCARD) { /* we don't discard raid56 yet */ - if (map->type & - (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out; } @@ -5000,7 +5020,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && (rw & REQ_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); @@ -5047,7 +5067,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 physical_of_found = 0; ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, NULL); + logical, &tmp_length, &tmp_bbio, 0, 0); if (ret) { WARN_ON(tmp_bbio != NULL); goto out; @@ -5061,7 +5081,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * is not left of the left cursor */ ret = -EIO; - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); goto out; } @@ -5096,11 +5116,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else { WARN_ON(1); ret = -EIO; - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); goto out; } - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5166,15 +5186,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, mirror_num = stripe_index - old_stripe_index + 1; } - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { - u64 tmp; - - if (raid_map_ret && + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + if (need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { - int i, rot; - /* push stripe_nr back to the start of the full stripe */ stripe_nr = raid56_full_stripe_start; do_div(stripe_nr, stripe_len * nr_data_stripes(map)); @@ -5183,32 +5198,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = map->num_stripes; max_errors = nr_parity_stripes(map); - raid_map = kmalloc_array(num_stripes, sizeof(u64), - GFP_NOFS); - if (!raid_map) { - ret = -ENOMEM; - goto out; - } - - /* Work out the disk rotation on this stripe-set */ - tmp = stripe_nr; - rot = do_div(tmp, num_stripes); - - /* Fill in the logical address of each stripe */ - tmp = stripe_nr * nr_data_stripes(map); - for (i = 0; i < nr_data_stripes(map); i++) - raid_map[(i+rot) % num_stripes] = - em->start + (tmp + i) * map->stripe_len; - - raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; - if (map->type & BTRFS_BLOCK_GROUP_RAID6) - raid_map[(i+rot+1) % num_stripes] = - RAID6_Q_STRIPE; - *length = map->stripe_len; stripe_index = 0; stripe_offset = 0; } else { + u64 tmp; + /* * Mirror #0 or #1 means the original data block. * Mirror #2 is RAID5 parity block. @@ -5246,17 +5241,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, tgtdev_indexes = num_stripes; } - bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), - GFP_NOFS); + bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); if (!bbio) { - kfree(raid_map); ret = -ENOMEM; goto out; } - atomic_set(&bbio->error, 0); if (dev_replace_is_ongoing) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); + /* build raid_map */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && + need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + mirror_num > 1)) { + u64 tmp; + int i, rot; + + bbio->raid_map = (u64 *)((void *)bbio->stripes + + sizeof(struct btrfs_bio_stripe) * + num_alloc_stripes + + sizeof(int) * tgtdev_indexes); + + /* Work out the disk rotation on this stripe-set */ + tmp = stripe_nr; + rot = do_div(tmp, num_stripes); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + bbio->raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + bbio->raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + } + if (rw & REQ_DISCARD) { int factor = 0; int sub_stripes = 0; @@ -5340,6 +5360,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) max_errors = btrfs_chunk_max_errors(map); + if (bbio->raid_map) + sort_parity_stripes(bbio, num_stripes); + tgtdev_indexes = 0; if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && dev_replace->tgtdev != NULL) { @@ -5427,6 +5450,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } *bbio_ret = bbio; + bbio->map_type = map->type; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; @@ -5443,10 +5467,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->stripes[0].physical = physical_to_patch_in_first_stripe; bbio->mirror_num = map->num_stripes + 1; } - if (raid_map) { - sort_parity_stripes(bbio, raid_map); - *raid_map_ret = raid_map; - } out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); @@ -5459,17 +5479,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_bio **bbio_ret, int mirror_num) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num, NULL); + mirror_num, 0); } /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, - u64 **raid_map_ret) + int need_raid_map) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num, raid_map_ret); + mirror_num, need_raid_map); } int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -5511,8 +5531,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { do_div(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } @@ -5565,7 +5584,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e bio_endio_nodec(bio, err); else bio_endio(bio, err); - kfree(bbio); + btrfs_put_bbio(bbio); } static void btrfs_end_bio(struct bio *bio, int err) @@ -5808,7 +5827,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; - u64 *raid_map = NULL; int ret; int dev_nr = 0; int total_devs = 1; @@ -5819,7 +5837,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, btrfs_bio_counter_inc_blocked(root->fs_info); ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num, &raid_map); + mirror_num, 1); if (ret) { btrfs_bio_counter_dec(root->fs_info); return ret; @@ -5832,15 +5850,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (raid_map) { + if (bbio->raid_map) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { - ret = raid56_parity_write(root, bio, bbio, - raid_map, map_length); + ret = raid56_parity_write(root, bio, bbio, map_length); } else { - ret = raid56_parity_recover(root, bio, bbio, - raid_map, map_length, + ret = raid56_parity_recover(root, bio, bbio, map_length, mirror_num, 1); } @@ -6238,17 +6254,22 @@ int btrfs_read_sys_array(struct btrfs_root *root) struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; - u8 *ptr; - unsigned long sb_ptr; + u8 *array_ptr; + unsigned long sb_array_offset; int ret = 0; u32 num_stripes; u32 array_size; u32 len = 0; - u32 cur; + u32 cur_offset; struct btrfs_key key; - sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, - BTRFS_SUPER_INFO_SIZE); + ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); + /* + * This will create extent buffer of nodesize, superblock size is + * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will + * overallocate but we can keep it as-is, only the first page is used. + */ + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); @@ -6271,35 +6292,56 @@ int btrfs_read_sys_array(struct btrfs_root *root) write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); - ptr = super_copy->sys_chunk_array; - sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); - cur = 0; + array_ptr = super_copy->sys_chunk_array; + sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); + cur_offset = 0; + + while (cur_offset < array_size) { + disk_key = (struct btrfs_disk_key *)array_ptr; + len = sizeof(*disk_key); + if (cur_offset + len > array_size) + goto out_short_read; - while (cur < array_size) { - disk_key = (struct btrfs_disk_key *)ptr; btrfs_disk_key_to_cpu(&key, disk_key); - len = sizeof(*disk_key); ptr += len; - sb_ptr += len; - cur += len; + array_ptr += len; + sb_array_offset += len; + cur_offset += len; if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)sb_ptr; + chunk = (struct btrfs_chunk *)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be + * present, exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; + + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + ret = read_one_chunk(root, &key, sb, chunk); if (ret) break; - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - len = btrfs_chunk_item_size(num_stripes); } else { ret = -EIO; break; } - ptr += len; - sb_ptr += len; - cur += len; + array_ptr += len; + sb_array_offset += len; + cur_offset += len; } free_extent_buffer(sb); return ret; + +out_short_read: + printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", + len, cur_offset); + free_extent_buffer(sb); + return -EIO; } int btrfs_read_chunk_tree(struct btrfs_root *root) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d6fe73c0f4a2..83069dec6898 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); #define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) struct btrfs_bio { + atomic_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; + u64 map_type; /* get from map_lookup->type */ bio_end_io_t *end_io; struct bio *orig_bio; unsigned long flags; @@ -307,6 +309,12 @@ struct btrfs_bio { int mirror_num; int num_tgtdevs; int *tgtdev_map; + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; struct btrfs_bio_stripe stripes[]; }; @@ -388,19 +396,15 @@ struct btrfs_balance_control { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); - -#define btrfs_bio_size(total_stripes, real_stripes) \ - (sizeof(struct btrfs_bio) + \ - (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \ - (sizeof(int) * (real_stripes))) - +void btrfs_get_bbio(struct btrfs_bio *bbio); +void btrfs_put_bbio(struct btrfs_bio *bbio); int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, - u64 **raid_map_ret); + int need_raid_map); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); |