From aeea4c10a88c9ec405072cecb18939b0a9c876f6 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 2 Mar 2017 22:42:38 +0300 Subject: MAINTAINERS: add btrfs file entries for include directories Add file entries for btrfs header files. Signed-off-by: Dmitry V. Levin Acked-by: David Sterba Signed-off-by: David Sterba --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 676c139bc883..f3dae93a3c90 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2904,6 +2904,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs.git S: Maintained F: Documentation/filesystems/btrfs.txt F: fs/btrfs/ +F: include/linux/btrfs* +F: include/uapi/linux/btrfs* BTTV VIDEO4LINUX DRIVER M: Mauro Carvalho Chehab -- cgit v1.2.3 From 14506127979a5a3d0c5d9b4cc76ce9d4ec23b717 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Tue, 7 Mar 2017 23:34:44 +0100 Subject: btrfs: fix a bogus warning when converting only data or metadata If your filesystem has, eg, data:raid0 metadata:raid1, and you run "btrfs balance -dconvert=raid1", the meta.target field will be uninitialized. That's otherwise ok, as it's unused except for this warning. Thus, let's use the existing set of raid levels for the comparison. As a side effect, non-convert balances will now nag about data>metadata. Signed-off-by: Adam Borowski Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ab8a66d852f9..02bbc4c8c755 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3755,6 +3755,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 meta_target, data_target; u64 allowed; int mixed = 0; int ret; @@ -3851,11 +3852,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); - if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < - btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < + btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { btrfs_warn(fs_info, "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", - bctl->meta.target, bctl->data.target); + meta_target, data_target); } if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { -- cgit v1.2.3 From f95fda8751b7b243c810805d1deaae502716d8fb Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 2 Mar 2017 18:41:19 -0800 Subject: Btrfs: remove ASSERT in btrfs_truncate_inode_items After 76b42abbf748 ("Btrfs: fix data loss after truncate when using the no-holes feature"), For either NO_HOLES or inline extents, we've set last_size to newsize to avoid data loss after remount or inode got evicted and read again, thus, we don't need this check anymore. Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e71f1ea3391..568b28114f09 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4603,13 +4603,6 @@ error: btrfs_free_path(path); - if (err == 0) { - /* only inline file may have last_size != new_size */ - if (new_size >= fs_info->sectorsize || - new_size > fs_info->max_inline) - ASSERT(last_size == new_size); - } - if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { -- cgit v1.2.3 From 140475ae4ad10d140bb69572499b1ff87367e807 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:10 +0200 Subject: btrfs: convert btrfs_bio.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 ++++---- fs/btrfs/volumes.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 02bbc4c8c755..0f6706047167 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5301,22 +5301,22 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) GFP_NOFS|__GFP_NOFAIL); atomic_set(&bbio->error, 0); - atomic_set(&bbio->refs, 1); + refcount_set(&bbio->refs, 1); return bbio; } void btrfs_get_bbio(struct btrfs_bio *bbio) { - WARN_ON(!atomic_read(&bbio->refs)); - atomic_inc(&bbio->refs); + WARN_ON(!refcount_read(&bbio->refs)); + refcount_inc(&bbio->refs); } void btrfs_put_bbio(struct btrfs_bio *bbio) { if (!bbio) return; - if (atomic_dec_and_test(&bbio->refs)) + if (refcount_dec_and_test(&bbio->refs)) kfree(bbio); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 59be81206dd7..ac0bf7d0df60 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -298,7 +298,7 @@ struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); struct btrfs_bio { - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; u64 map_type; /* get from map_lookup->type */ -- cgit v1.2.3 From 9b64f57ddf8673d29fafb3405d4aa1e93f5a4cd7 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:11 +0200 Subject: btrfs: convert btrfs_transaction.use_count from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/transaction.c | 20 ++++++++++---------- fs/btrfs/transaction.h | 4 +++- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb1ee7b6f532..e4aa64c263f9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4615,7 +4615,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); if (t->state >= TRANS_STATE_COMMIT_START) { - atomic_inc(&t->use_count); + refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); btrfs_put_transaction(t); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index be5477676cc8..85e51f311b57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10850,7 +10850,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ret = find_free_dev_extent_start(trans, device, minlen, start, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9a46878ba60f..da5399f9fb54 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ASSERT(trans); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 61b807de3e16..a7d7a7d1d78a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(atomic_read(&transaction->use_count) == 0); - if (atomic_dec_and_test(&transaction->use_count)) { + WARN_ON(refcount_read(&transaction->use_count) == 0); + if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); if (transaction->delayed_refs.pending_csums) @@ -207,7 +207,7 @@ loop: spin_unlock(&fs_info->trans_lock); return -EBUSY; } - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); @@ -257,7 +257,7 @@ loop: * One for this trans handle, one so it will live on until we * commit the transaction. */ - atomic_set(&cur_trans->use_count, 2); + refcount_set(&cur_trans->use_count, 2); atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = get_seconds(); @@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->trans_lock); cur_trans = fs_info->running_transaction; if (cur_trans && is_transaction_blocked(cur_trans)) { - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_event(fs_info->transaction_wait, @@ -744,7 +744,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = 0; break; } @@ -773,7 +773,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); break; } } @@ -1839,7 +1839,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, /* take transaction reference */ cur_trans = trans->transaction; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); btrfs_end_transaction(trans); @@ -2015,7 +2015,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&fs_info->trans_lock); - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans); @@ -2035,7 +2035,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (prev_trans->state != TRANS_STATE_COMPLETED) { - atomic_inc(&prev_trans->use_count); + refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5dfb5590fff6..902619f83db6 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -18,6 +18,8 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ + +#include #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" @@ -49,7 +51,7 @@ struct btrfs_transaction { * transaction can end */ atomic_t num_writers; - atomic_t use_count; + refcount_t use_count; atomic_t pending_ordered; unsigned long flags; -- cgit v1.2.3 From 490b54d6fb75f6ffd0471ec58bb38a992e2b40cd Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:12 +0200 Subject: btrfs: convert extent_map.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_map.c | 10 +++++----- fs/btrfs/extent_map.h | 3 ++- fs/btrfs/tree-log.c | 2 +- fs/btrfs/volumes.c | 2 +- include/trace/events/btrfs.h | 2 +- 6 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 27fdb250b446..3649932e48d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2859,7 +2859,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } @@ -2870,7 +2870,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); - atomic_inc(&em->refs); + refcount_inc(&em->refs); *em_cached = em; } return em; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 26f9ac719d20..69850155870c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void) em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; - atomic_set(&em->refs, 1); + refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; } @@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em) { if (!em) return; - WARN_ON(atomic_read(&em->refs) == 0); - if (atomic_dec_and_test(&em->refs)) { + WARN_ON(refcount_read(&em->refs) == 0); + if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) @@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); em->mod_start = em->start; em->mod_len = em->len; @@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, if (strict && !(end > em->start && start < extent_map_end(em))) return NULL; - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index eb8b8fae036b..a67b2def5413 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -2,6 +2,7 @@ #define __EXTENTMAP__ #include +#include #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) @@ -41,7 +42,7 @@ struct extent_map { */ struct map_lookup *map_lookup; }; - atomic_t refs; + refcount_t refs; unsigned int compress_type; struct list_head list; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a59674c3e69e..ccfe9fe7754a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4196,7 +4196,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; /* Need a ref to keep it from getting evicted from cache */ - atomic_inc(&em->refs); + refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); num++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0f6706047167..dce59fb59b0c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4839,7 +4839,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = add_extent_mapping(em_tree, em, 0); if (!ret) { list_add_tail(&em->list, &trans->transaction->pending_chunks); - atomic_inc(&em->refs); + refcount_inc(&em->refs); } write_unlock(&em_tree->lock); if (ret) { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index a3c3cab643a9..9dd29e806fed 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -213,7 +213,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->block_start = map->block_start; __entry->block_len = map->block_len; __entry->flags = map->flags; - __entry->refs = atomic_read(&map->refs); + __entry->refs = refcount_read(&map->refs); __entry->compress_type = map->compress_type; ), -- cgit v1.2.3 From e76edab7f059bc1047c1865141e2709d70e74852 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:13 +0200 Subject: btrfs: convert btrfs_ordered_extent.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 18 +++++++++--------- fs/btrfs/ordered-data.h | 2 +- include/trace/events/btrfs.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index da5399f9fb54..7b40e2e7292a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); /* one ref for the tree */ - atomic_set(&entry->refs, 1); + refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); @@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -425,7 +425,7 @@ have_entry: out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct btrfs_inode *inode, if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) continue; list_add(&ordered->log_list, logged_list); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } @@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); - if (atomic_dec_and_test(&entry->refs)) { + if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->log_list)); ASSERT(list_empty(&entry->trans_list)); ASSERT(list_empty(&entry->root_extent_list)); @@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, list_move_tail(&ordered->root_extent_list, &root->ordered_extents); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, @@ -870,7 +870,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (!offset_in_entry(entry, file_offset)) entry = NULL; if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; @@ -911,7 +911,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( } out: if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); spin_unlock_irq(&tree->lock); return entry; } @@ -948,7 +948,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 195c93b67fe0..e0c1d5b8d859 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -113,7 +113,7 @@ struct btrfs_ordered_extent { int compress_type; /* reference count */ - atomic_t refs; + refcount_t refs; /* the inode we belong to */ struct inode *inode; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 9dd29e806fed..8f206263fee7 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -275,7 +275,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __entry->bytes_left = ordered->bytes_left; __entry->flags = ordered->flags; __entry->compress_type = ordered->compress_type; - __entry->refs = atomic_read(&ordered->refs); + __entry->refs = refcount_read(&ordered->refs); __entry->root_objectid = BTRFS_I(inode)->root->root_key.objectid; __entry->truncated_len = ordered->truncated_len; -- cgit v1.2.3 From 1e4f4714d59e3fdcde29ee12d40d2e96875a026f Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:14 +0200 Subject: btrfs: convert btrfs_caching_control.count from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/extent-tree.c | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c4115901d906..cdfc2a46448b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -39,6 +39,7 @@ #include #include #include +#include #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -518,7 +519,7 @@ struct btrfs_caching_control { struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; - atomic_t count; + refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 85e51f311b57..5c84eea60703 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -316,14 +316,14 @@ get_caching_control(struct btrfs_block_group_cache *cache) } ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); spin_unlock(&cache->lock); return ctl; } static void put_caching_control(struct btrfs_caching_control *ctl) { - if (atomic_dec_and_test(&ctl->count)) + if (refcount_dec_and_test(&ctl->count)) kfree(ctl); } @@ -599,7 +599,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; - atomic_set(&caching_ctl->count, 1); + refcount_set(&caching_ctl->count, 1); btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, caching_thread, NULL, NULL); @@ -620,7 +620,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_caching_control *ctl; ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&cache->lock); @@ -707,7 +707,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } down_write(&fs_info->commit_root_sem); - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->commit_root_sem); @@ -10416,7 +10416,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->caching_block_groups, list) if (ctl->block_group == block_group) { caching_ctl = ctl; - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); break; } } -- cgit v1.2.3 From 6df8cdf5bda221f268ac23940bce589ad176993d Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:15 +0200 Subject: btrfs: convert btrfs_delayed_ref_node.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 +- fs/btrfs/delayed-ref.c | 8 ++++---- fs/btrfs/delayed-ref.h | 8 +++++--- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 6 +++--- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7699e16784d3..116338344224 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1286,7 +1286,7 @@ again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6eb80952efb3..be70d90dfee5 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -590,7 +590,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = count_mod; @@ -682,7 +682,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; @@ -739,7 +739,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, seq = atomic64_read(&fs_info->tree_mod_seq); /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 0e537f98f1a1..c0264ff01b53 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,6 +18,8 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ +#include + /* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ @@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node { u64 seq; /* ref count on this data structure */ - atomic_t refs; + refcount_t refs; /* * how many refs is this entry adding or deleting. For @@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { - WARN_ON(atomic_read(&ref->refs) == 0); - if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(refcount_read(&ref->refs) == 0); + if (refcount_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e4aa64c263f9..3748bc54a6ab 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4343,7 +4343,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5c84eea60703..5a7ddcff406b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -892,7 +892,7 @@ search_again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -2980,7 +2980,7 @@ again: struct btrfs_delayed_ref_node *ref; ref = &head->node; - atomic_inc(&ref->refs); + refcount_inc(&ref->refs); spin_unlock(&delayed_refs->lock); /* @@ -3057,7 +3057,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); -- cgit v1.2.3 From 6de5f18e7b0da0cdd265eda047a0bc4f48260bcb Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:16 +0200 Subject: btrfs: convert btrfs_delayed_node.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 28 ++++++++++++++-------------- fs/btrfs/delayed-inode.h | 4 ++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1aff676f0e5b..7396c36e0adb 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node( { delayed_node->root = root; delayed_node->inode_id = inode_id; - atomic_set(&delayed_node->refs, 0); + refcount_set(&delayed_node->refs, 0); delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -81,7 +81,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = READ_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); + refcount_inc(&node->refs); return node; } @@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { - atomic_inc(&node->refs); /* can be accessed */ + refcount_inc(&node->refs); /* can be accessed */ BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); return node; } btrfs_inode->delayed_node = node; /* can be accessed and cached in the inode */ - atomic_add(2, &node->refs); + refcount_add(2, &node->refs); spin_unlock(&root->inode_lock); return node; } @@ -125,7 +125,7 @@ again: btrfs_init_delayed_node(node, root, ino); /* cached in the btrfs inode and can be accessed */ - atomic_add(2, &node->refs); + refcount_set(&node->refs, 2); ret = radix_tree_preload(GFP_NOFS); if (ret) { @@ -166,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, } else { list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); - atomic_inc(&node->refs); /* inserted into list */ + refcount_inc(&node->refs); /* inserted into list */ root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -180,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; - atomic_dec(&node->refs); /* not in the list */ + refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); @@ -201,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node( p = delayed_root->node_list.next; node = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -228,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( p = node->n_list.next; next = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); out: spin_unlock(&delayed_root->lock); @@ -253,11 +253,11 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); - if (atomic_dec_and_test(&delayed_node->refs)) { + if (refcount_dec_and_test(&delayed_node->refs)) { bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); - if (atomic_read(&delayed_node->refs) == 0) { + if (refcount_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); free = true; @@ -286,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( p = delayed_root->prepare_list.next; list_del_init(p); node = list_entry(p, struct btrfs_delayed_node, p_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -1621,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. */ - atomic_dec(&delayed_node->refs); + refcount_dec(&delayed_node->refs); return true; } @@ -1963,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) inode_id = delayed_nodes[n - 1]->inode_id + 1; for (i = 0; i < n; i++) - atomic_inc(&delayed_nodes[i]->refs); + refcount_inc(&delayed_nodes[i]->refs); spin_unlock(&root->inode_lock); for (i = 0; i < n; i++) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 40327cc3b99a..2f494fdde0ac 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -26,7 +26,7 @@ #include #include #include - +#include #include "ctree.h" /* types of the delayed item */ @@ -67,7 +67,7 @@ struct btrfs_delayed_node { struct rb_root del_root; struct mutex mutex; struct btrfs_inode_item inode_item; - atomic_t refs; + refcount_t refs; u64 index_cnt; unsigned long flags; int count; -- cgit v1.2.3 From 089e77e10d7e44a007101cf7323dc7bda66172f4 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:17 +0200 Subject: btrfs: convert btrfs_delayed_item.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 18 +++++++++--------- fs/btrfs/delayed-inode.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 7396c36e0adb..8ae409b5a61d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -308,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->ins_or_del = 0; item->bytes_reserved = 0; item->delayed_node = NULL; - atomic_set(&item->refs, 1); + refcount_set(&item->refs, 1); } return item; } @@ -483,7 +483,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) { if (item) { __btrfs_remove_delayed_item(item); - if (atomic_dec_and_test(&item->refs)) + if (refcount_dec_and_test(&item->refs)) kfree(item); } } @@ -1600,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, ins_list); item = __btrfs_next_delayed_item(item); } item = __btrfs_first_delayed_deletion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, del_list); item = __btrfs_next_delayed_item(item); } @@ -1634,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } list_for_each_entry_safe(curr, next, del_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } @@ -1667,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, list_del(&curr->readdir_list); ret = (curr->key.offset == index); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (ret) @@ -1705,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, list_del(&curr->readdir_list); if (curr->key.offset < ctx->pos) { - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); continue; } @@ -1722,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, over = !dir_emit(ctx, name, name_len, location.objectid, d_type); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (over) diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 2f494fdde0ac..c4189d495934 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -80,7 +80,7 @@ struct btrfs_delayed_item { struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; struct btrfs_delayed_node *delayed_node; - atomic_t refs; + refcount_t refs; int ins_or_del; u32 data_len; char data[0]; -- cgit v1.2.3 From 0700cea7c8b387c8c6bc4de79b197baa0b3fc4a3 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:18 +0200 Subject: btrfs: convert btrfs_root.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/disk-io.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cdfc2a46448b..285566cc2f7d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1222,7 +1222,7 @@ struct btrfs_root { dev_t anon_dev; spinlock_t root_item_lock; - atomic_t refs; + refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3748bc54a6ab..bd415e1dd114 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1340,7 +1340,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - atomic_set(&root->refs, 1); + refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2e0ec29bfd69..21f1ceb85b76 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -101,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); */ static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) { - if (atomic_inc_not_zero(&root->refs)) + if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } static inline void btrfs_put_fs_root(struct btrfs_root *root) { - if (atomic_dec_and_test(&root->refs)) + if (refcount_dec_and_test(&root->refs)) kfree(root); } -- cgit v1.2.3 From b7ac31b7b2ebd735b7b67c85711ef6d16648051a Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:19 +0200 Subject: btrfs: convert extent_state.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 +++++++------- fs/btrfs/extent_io.h | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3649932e48d5..174ffffd6d97 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void) pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), - atomic_read(&state->refs)); + refcount_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } @@ -238,7 +238,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); btrfs_leak_debug_add(&state->leak_list, &states); - atomic_set(&state->refs, 1); + refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); return state; @@ -248,7 +248,7 @@ void free_extent_state(struct extent_state *state) { if (!state) return; - if (atomic_dec_and_test(&state->refs)) { + if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); @@ -641,7 +641,7 @@ again: if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start) { if (clear) - atomic_dec(&cached->refs); + refcount_dec(&cached->refs); state = cached; goto hit_next; } @@ -793,7 +793,7 @@ process_node: if (state->state & bits) { start = state->start; - atomic_inc(&state->refs); + refcount_inc(&state->refs); wait_on_state(tree, state); free_extent_state(state); goto again; @@ -834,7 +834,7 @@ static void cache_state_if_flags(struct extent_state *state, if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { *cached_ptr = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } } } @@ -1538,7 +1538,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, if (!found) { *start = state->start; *cached_state = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } found++; *end = state->end; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3e4fad4a909d..8d2d6e4272d5 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include +#include #include "ulist.h" /* bits for the extent state */ @@ -143,7 +144,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; - atomic_t refs; + refcount_t refs; unsigned state; struct io_failure_record *failrec; -- cgit v1.2.3 From a50299ae7cc4ba7268749e144e7c16321c25e3b3 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:20 +0200 Subject: btrfs: convert compressed_bio.pending_bios from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/compression.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c7721a6aa3bb..10e6b282d09d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -44,7 +44,7 @@ struct compressed_bio { /* number of bios pending for this compressed extent */ - atomic_t pending_bios; + refcount_t pending_bios; /* the pages with the compressed data on them */ struct page **compressed_pages; @@ -161,7 +161,7 @@ static void end_compressed_bio_read(struct bio *bio) /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; inode = cb->inode; @@ -274,7 +274,7 @@ static void end_compressed_bio_write(struct bio *bio) /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; /* ok, we're the last bio for this extent, step one is to @@ -342,7 +342,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return -ENOMEM; - atomic_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->start = start; @@ -363,7 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); /* create and submit bios for the compressed pages */ bytes_left = compressed_len; @@ -388,7 +388,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * we inc the count. Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -607,7 +607,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!cb) goto out; - atomic_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@ -656,7 +656,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { page = cb->compressed_pages[pg_index]; @@ -685,7 +685,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, * we inc the count. Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, -- cgit v1.2.3 From 6f615018b35fb5ea1ce49673079fc637a0b6be70 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:21 +0200 Subject: btrfs: convert scrub_recover.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b0251eb1239f..c9406bf302b8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -64,7 +64,7 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_recover { - atomic_t refs; + refcount_t refs; struct btrfs_bio *bbio; u64 map_length; }; @@ -857,12 +857,12 @@ out: static inline void scrub_get_recover(struct scrub_recover *recover) { - atomic_inc(&recover->refs); + refcount_inc(&recover->refs); } static inline void scrub_put_recover(struct scrub_recover *recover) { - if (atomic_dec_and_test(&recover->refs)) { + if (refcount_dec_and_test(&recover->refs)) { btrfs_put_bbio(recover->bbio); kfree(recover); } @@ -1343,7 +1343,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, return -ENOMEM; } - atomic_set(&recover->refs, 1); + refcount_set(&recover->refs, 1); recover->bbio = bbio; recover->map_length = mapped_length; -- cgit v1.2.3 From 186debd6ede29da9d51cffdb93cadee66e4e1e23 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:23 +0200 Subject: btrfs: convert scrub_block.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c9406bf302b8..d2c35848e61d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -112,7 +112,7 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t refs; /* free mem on transition to zero */ + refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; struct { @@ -1998,12 +1998,12 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->refs); + refcount_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->refs)) { + if (refcount_dec_and_test(&sblock->refs)) { int i; if (sblock->sparity) @@ -2255,7 +2255,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2555,7 +2555,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; sblock->sparity = sparity; -- cgit v1.2.3 From 78a764504d1e11411bec0e068c5d9e0a417aff08 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:24 +0200 Subject: btrfs: convert scrub_parity.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d2c35848e61d..8130ab11821f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -142,7 +142,7 @@ struct scrub_parity { int stripe_len; - atomic_t refs; + refcount_t refs; struct list_head spages; @@ -2822,12 +2822,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors) static void scrub_parity_get(struct scrub_parity *sparity) { - atomic_inc(&sparity->refs); + refcount_inc(&sparity->refs); } static void scrub_parity_put(struct scrub_parity *sparity) { - if (!atomic_dec_and_test(&sparity->refs)) + if (!refcount_dec_and_test(&sparity->refs)) return; scrub_parity_check_and_repair(sparity); @@ -2879,7 +2879,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->scrub_dev = sdev; sparity->logic_start = logic_start; sparity->logic_end = logic_end; - atomic_set(&sparity->refs, 1); + refcount_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->spages); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; -- cgit v1.2.3 From 99f4cdb16f80e1b30392b479d852036f87394b20 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:25 +0200 Subject: btrfs: convert scrub_ctx.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8130ab11821f..6ba0c9ddf777 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -202,7 +202,7 @@ struct scrub_ctx { * doesn't free the scrub context before or while the workers are * doing the wakeup() call. */ - atomic_t refs; + refcount_t refs; }; struct scrub_fixup_nodatasum { @@ -305,7 +305,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } @@ -356,7 +356,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. we must also @@ -447,7 +447,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) static void scrub_put_ctx(struct scrub_ctx *sctx) { - if (atomic_dec_and_test(&sctx->refs)) + if (refcount_dec_and_test(&sctx->refs)) scrub_free_ctx(sctx); } @@ -462,7 +462,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; - atomic_set(&sctx->refs, 1); + refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; -- cgit v1.2.3 From dec95574f4e6545c701420b950278dc6f55d0368 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 3 Mar 2017 10:55:26 +0200 Subject: btrfs: convert btrfs_raid_bio.refs from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1571bf26dc07..a8954f5188b4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -149,7 +149,7 @@ struct btrfs_raid_bio { int generic_bio_cnt; - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; @@ -389,7 +389,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (bio_list_empty(&rbio->bio_list)) { if (!list_empty(&rbio->hash_list)) { list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); BUG_ON(!list_empty(&rbio->plug_list)); } } @@ -480,7 +480,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) /* bump our ref if we were not in the list before */ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); if (!list_empty(&rbio->stripe_cache)){ list_move(&rbio->stripe_cache, &table->stripe_cache); @@ -689,7 +689,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) test_bit(RBIO_CACHE_BIT, &cur->flags) && !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { list_del_init(&cur->hash_list); - atomic_dec(&cur->refs); + refcount_dec(&cur->refs); steal_rbio(cur, rbio); cache_drop = cur; @@ -738,7 +738,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) } } lockit: - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: spin_unlock_irqrestore(&h->lock, flags); @@ -784,7 +784,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); /* * we use the plug list to hold all the rbios @@ -801,7 +801,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_del_init(&rbio->plug_list); list_add(&next->hash_list, &h->hash_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); @@ -843,8 +843,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; - WARN_ON(atomic_read(&rbio->refs) < 0); - if (!atomic_dec_and_test(&rbio->refs)) + if (!refcount_dec_and_test(&rbio->refs)) return; WARN_ON(!list_empty(&rbio->stripe_cache)); @@ -997,7 +996,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->stripe_npages = stripe_npages; rbio->faila = -1; rbio->failb = -1; - atomic_set(&rbio->refs, 1); + refcount_set(&rbio->refs, 1); atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); -- cgit v1.2.3 From 09ed2f165cb3449237dec842b3564044e12d22cb Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 10 Mar 2017 11:09:48 -0800 Subject: Btrfs: add file item tracepoints While debugging truncate problems, I found that these tracepoints could help us quickly know what went wrong. Two sets of tracepoints are created to track regular/prealloc file item and inline file item respectively, I put inline as a separate one since what inline file items cares about are way less than the regular one. This adds four tracepoints: - btrfs_get_extent_show_fi_regular - btrfs_get_extent_show_fi_inline - btrfs_truncate_show_fi_regular - btrfs_truncate_show_fi_inline Cc: David Sterba Signed-off-by: Liu Bo Reviewed-by: David Sterba [ formatting adjustments ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 15 +++++ include/trace/events/btrfs.h | 139 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 568b28114f09..da71119b5b7c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4401,9 +4401,17 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); + + trace_btrfs_truncate_show_fi_regular( + BTRFS_I(inode), leaf, fi, + found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, path->slots[0], fi); + + trace_btrfs_truncate_show_fi_inline( + BTRFS_I(inode), leaf, fi, path->slots[0], + found_key.offset); } item_end--; } @@ -6828,11 +6836,18 @@ again: found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); + + trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, + extent_start); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, fs_info->sectorsize); + + trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, + path->slots[0], + extent_start); } next: if (start >= extent_end) { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8f206263fee7..cef39e2baf21 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -12,6 +12,7 @@ struct btrfs_root; struct btrfs_fs_info; struct btrfs_inode; struct extent_map; +struct btrfs_file_extent_item; struct btrfs_ordered_extent; struct btrfs_delayed_ref_node; struct btrfs_delayed_tree_ref; @@ -54,6 +55,12 @@ struct btrfs_qgroup_extent_record; (obj >= BTRFS_ROOT_TREE_OBJECTID && \ obj <= BTRFS_QUOTA_TREE_OBJECTID)) ? __show_root_type(obj) : "-" +#define show_fi_type(type) \ + __print_symbolic(type, \ + { BTRFS_FILE_EXTENT_INLINE, "INLINE" }, \ + { BTRFS_FILE_EXTENT_REG, "REG" }, \ + { BTRFS_FILE_EXTENT_PREALLOC, "PREALLOC"}) + #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ @@ -232,6 +239,138 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->refs, __entry->compress_type) ); +/* file extent item */ +DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, u64 start), + + TP_ARGS(bi, l, fi, start), + + TP_STRUCT__entry_btrfs( + __field( u64, root_obj ) + __field( u64, ino ) + __field( loff_t, isize ) + __field( u64, disk_isize ) + __field( u64, num_bytes ) + __field( u64, ram_bytes ) + __field( u64, disk_bytenr ) + __field( u64, disk_num_bytes ) + __field( u64, extent_offset ) + __field( u8, extent_type ) + __field( u8, compression ) + __field( u64, extent_start ) + __field( u64, extent_end ) + ), + + TP_fast_assign_btrfs(bi->root->fs_info, + __entry->root_obj = bi->root->objectid; + __entry->ino = btrfs_ino(bi); + __entry->isize = bi->vfs_inode.i_size; + __entry->disk_isize = bi->disk_i_size; + __entry->num_bytes = btrfs_file_extent_num_bytes(l, fi); + __entry->ram_bytes = btrfs_file_extent_ram_bytes(l, fi); + __entry->disk_bytenr = btrfs_file_extent_disk_bytenr(l, fi); + __entry->disk_num_bytes = btrfs_file_extent_disk_num_bytes(l, fi); + __entry->extent_offset = btrfs_file_extent_offset(l, fi); + __entry->extent_type = btrfs_file_extent_type(l, fi); + __entry->compression = btrfs_file_extent_compression(l, fi); + __entry->extent_start = start; + __entry->extent_end = (start + __entry->num_bytes); + ), + + TP_printk_btrfs( + "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu " + "file extent range=[%llu %llu] " + "(num_bytes=%llu ram_bytes=%llu disk_bytenr=%llu " + "disk_num_bytes=%llu extent_offset=%llu type=%s " + "compression=%u", + show_root_type(__entry->root_obj), __entry->ino, + __entry->isize, + __entry->disk_isize, __entry->extent_start, + __entry->extent_end, __entry->num_bytes, __entry->ram_bytes, + __entry->disk_bytenr, __entry->disk_num_bytes, + __entry->extent_offset, show_fi_type(__entry->extent_type), + __entry->compression) +); + +DECLARE_EVENT_CLASS( + btrfs__file_extent_item_inline, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, int slot, u64 start), + + TP_ARGS(bi, l, fi, slot, start), + + TP_STRUCT__entry_btrfs( + __field( u64, root_obj ) + __field( u64, ino ) + __field( loff_t, isize ) + __field( u64, disk_isize ) + __field( u8, extent_type ) + __field( u8, compression ) + __field( u64, extent_start ) + __field( u64, extent_end ) + ), + + TP_fast_assign_btrfs( + bi->root->fs_info, + __entry->root_obj = bi->root->objectid; + __entry->ino = btrfs_ino(bi); + __entry->isize = bi->vfs_inode.i_size; + __entry->disk_isize = bi->disk_i_size; + __entry->extent_type = btrfs_file_extent_type(l, fi); + __entry->compression = btrfs_file_extent_compression(l, fi); + __entry->extent_start = start; + __entry->extent_end = (start + btrfs_file_extent_inline_len(l, slot, fi)); + ), + + TP_printk_btrfs( + "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu " + "file extent range=[%llu %llu] " + "extent_type=%s compression=%u", + show_root_type(__entry->root_obj), __entry->ino, __entry->isize, + __entry->disk_isize, __entry->extent_start, + __entry->extent_end, show_fi_type(__entry->extent_type), + __entry->compression) +); + +DEFINE_EVENT( + btrfs__file_extent_item_regular, btrfs_get_extent_show_fi_regular, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, u64 start), + + TP_ARGS(bi, l, fi, start) +); + +DEFINE_EVENT( + btrfs__file_extent_item_regular, btrfs_truncate_show_fi_regular, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, u64 start), + + TP_ARGS(bi, l, fi, start) +); + +DEFINE_EVENT( + btrfs__file_extent_item_inline, btrfs_get_extent_show_fi_inline, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, int slot, u64 start), + + TP_ARGS(bi, l, fi, slot, start) +); + +DEFINE_EVENT( + btrfs__file_extent_item_inline, btrfs_truncate_show_fi_inline, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, int slot, u64 start), + + TP_ARGS(bi, l, fi, slot, start) +); + #define show_ordered_flags(flags) \ __print_flags(flags, "|", \ { (1 << BTRFS_ORDERED_IO_DONE), "IO_DONE" }, \ -- cgit v1.2.3 From 592d92eeab3770b2525d2fc5589b205c9f8c33e3 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 Mar 2017 13:33:55 -0700 Subject: Btrfs: create a helper for getting chunk map We have similar code here and there, this merges them into a helper. Signed-off-by: Liu Bo Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 +- fs/btrfs/volumes.c | 163 +++++++++++++++++---------------------------------- fs/btrfs/volumes.h | 2 +- 3 files changed, 57 insertions(+), 111 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 174ffffd6d97..77bd016ba18b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2004,14 +2004,13 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); /* we can't repair anything in raid56 yet */ - if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) + if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) return 0; bio = btrfs_io_bio_alloc(GFP_NOFS, 1); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dce59fb59b0c..122201897bee 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2795,10 +2795,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, return ret; } +static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(fs_info, "unable to find logical %llu length %llu", + logical, length); + return ERR_PTR(-EINVAL); + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, + "found a bad mapping, wanted %llu-%llu, found %llu-%llu", + logical, length, em->start, em->start + em->len); + free_extent_map(em); + return ERR_PTR(-EINVAL); + } + + /* callers are responsible for dropping em's ref. */ + return em; +} + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; u64 dev_extent_len = 0; @@ -2806,23 +2834,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em_tree = &fs_info->mapping_tree.map_tree; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em || em->start > chunk_offset || - em->start + em->len < chunk_offset) { + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - if (em) - free_extent_map(em); - return -EINVAL; + return PTR_ERR(em); } map = em->map_lookup; mutex_lock(&fs_info->chunk_mutex); @@ -4894,7 +4914,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; size_t item_size; @@ -4903,24 +4922,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %Lu len %Lu", - chunk_offset, chunk_size); - return -EINVAL; - } - - if (em->start != chunk_offset || em->len != chunk_size) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu", - chunk_offset, chunk_size, em->start, em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); @@ -5061,15 +5065,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int readonly = 0; int miss_ndevs = 0; int i; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); - if (!em) + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) return 1; map = em->map_lookup; @@ -5123,34 +5124,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - - /* - * We could return errors for these cases, but that could get ugly and - * we'd probably do the same thing which is just not do anything else - * and exit, so return 1 so the callers don't try to use other copies. - */ - if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, - logical+len); - return 1; - } - - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu", - logical, logical+len, em->start, - em->start + em->len); - free_extent_map(em); + em = get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + /* + * We could return errors for these cases, but that could get + * ugly and we'd probably do the same thing which is just not do + * anything else and exit, so return 1 so the callers don't try + * to use other copies. + */ return 1; - } map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) @@ -5179,15 +5165,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; unsigned long len = fs_info->sectorsize; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + BUG_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); @@ -5195,20 +5177,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num) { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + BUG_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; @@ -5328,8 +5306,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; u64 stripe_end_offset; @@ -5351,23 +5327,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, *length); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %llu len %llu", - logical, *length); - return -EINVAL; - } - - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu, found %Lu-%Lu", - logical, em->start, em->start + em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, logical, *length); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; offset = logical - em->start; @@ -5903,8 +5865,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; struct extent_map *em; struct map_lookup *map; u64 *buf; @@ -5914,24 +5874,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 rmap_len; int i, j, nr = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_start, 1); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_err(fs_info, "couldn't find em for chunk %Lu", - chunk_start); + em = get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(em)) return -EIO; - } - if (em->start != chunk_start) { - btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu", - em->start, chunk_start); - free_extent_map(em); - return -EIO; - } map = em->map_lookup; - length = em->len; rmap_len = map->stripe_len; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ac0bf7d0df60..9ee6b706db31 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -475,7 +475,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct btrfs_mapping_tree *map_tree, -- cgit v1.2.3 From 0b3d4cd371edb6c4ef47f3fb0cb55bdd9aa471c1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 Mar 2017 13:33:56 -0700 Subject: Btrfs: separate DISCARD from __btrfs_map_block Since DISCARD is not as important as an operation like write, we don't copy it to target device during replace, and it makes __btrfs_map_block less complex. Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 289 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 175 insertions(+), 114 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 122201897bee..d777d626aa87 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5298,6 +5298,158 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) kfree(bbio); } +/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ +/* + * Please note that, discard won't be sent to target device of device + * replace. + */ +static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + struct btrfs_bio **bbio_ret) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_bio *bbio; + u64 offset; + u64 stripe_nr; + u64 stripe_nr_end; + u64 stripe_end_offset; + u64 stripe_cnt; + u64 stripe_len; + u64 stripe_offset; + u64 num_stripes; + u32 stripe_index; + u32 factor = 0; + u32 sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + int ret = 0; + int i; + + /* discard always return a bbio */ + ASSERT(bbio_ret); + + em = get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* we don't discard raid56 yet */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out; + } + + offset = logical - em->start; + length = min_t(u64, em->len - offset, length); + + stripe_len = map->stripe_len; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + stripe_nr = div64_u64(offset, stripe_len); + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_nr * stripe_len; + + stripe_nr_end = round_up(offset + length, map->stripe_len); + stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); + stripe_cnt = stripe_nr_end - stripe_nr; + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + length); + /* + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ + num_stripes = 1; + stripe_index = 0; + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + num_stripes = min_t(u64, map->num_stripes, + sub_stripes * stripe_cnt); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index *= sub_stripes; + stripes_per_dev = div_u64_rem(stripe_cnt, factor, + &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = map->num_stripes; + } else { + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); + } + + bbio = alloc_btrfs_bio(num_stripes, 0); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ + if (i < sub_stripes) + bbio->stripes[i].length -= + stripe_offset; + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) + bbio->stripes[i].length -= + stripe_end_offset; + + if (i == sub_stripes - 1) + stripe_offset = 0; + } else { + bbio->stripes[i].length = length; + } + + stripe_index++; + if (stripe_index == map->num_stripes) { + stripe_index = 0; + stripe_nr++; + } + } + + *bbio_ret = bbio; + bbio->map_type = map->type; + bbio->num_stripes = num_stripes; +out: + free_extent_map(em); + return ret; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5308,10 +5460,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, struct map_lookup *map; u64 offset; u64 stripe_offset; - u64 stripe_end_offset; u64 stripe_nr; - u64 stripe_nr_orig; - u64 stripe_nr_end; u64 stripe_len; u32 stripe_index; int i; @@ -5327,6 +5476,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; + if (op == BTRFS_MAP_DISCARD) + return __btrfs_map_block_for_discard(fs_info, logical, + *length, bbio_ret); + em = get_chunk_map(fs_info, logical, *length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5368,14 +5521,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, raid56_full_stripe_start *= full_stripe_len; } - if (op == BTRFS_MAP_DISCARD) { - /* we don't discard raid56 yet */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = -EOPNOTSUPP; - goto out; - } - *length = min_t(u64, em->len - offset, *length); - } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { u64 max_len; /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single @@ -5406,8 +5552,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { + op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS && + dev_replace->tgtdev != NULL) { /* * in dev-replace case, for repair case (that's the only * case where the mirror is selected explicitly when @@ -5487,24 +5633,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, num_stripes = 1; stripe_index = 0; - stripe_nr_orig = stripe_nr; - stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); - stripe_end_offset = stripe_nr_end * map->stripe_len - - (offset + *length); - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->num_stripes, - stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) + if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5517,8 +5652,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) { + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5534,10 +5668,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->sub_stripes * - (stripe_nr_end - stripe_nr_orig), - map->num_stripes); else if (mirror_num) stripe_index += mirror_num - 1; else { @@ -5580,8 +5710,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1) + if ((op != BTRFS_MAP_WRITE && + op != BTRFS_MAP_GET_READ_MIRRORS) && + mirror_num <= 1) mirror_num = 1; } } else { @@ -5604,7 +5735,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) + if (op == BTRFS_MAP_WRITE) num_alloc_stripes <<= 1; if (op == BTRFS_MAP_GET_READ_MIRRORS) num_alloc_stripes++; @@ -5647,84 +5778,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, RAID6_Q_STRIPE; } - if (op == BTRFS_MAP_DISCARD) { - u32 factor = 0; - u32 sub_stripes = 0; - u64 stripes_per_dev = 0; - u32 remaining_stripes = 0; - u32 last_stripe = 0; - - if (map->type & - (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - if (map->type & BTRFS_BLOCK_GROUP_RAID0) - sub_stripes = 1; - else - sub_stripes = map->sub_stripes; - - factor = map->num_stripes / sub_stripes; - stripes_per_dev = div_u64_rem(stripe_nr_end - - stripe_nr_orig, - factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; - } - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * - map->stripe_len; - - if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; - - /* - * Special for the first stripe and - * the last stripe: - * - * |-------|...|-------| - * |----------| - * off end_off - */ - if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; - - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; - - if (i == sub_stripes - 1) - stripe_offset = 0; - } else - bbio->stripes[i].length = *length; - - stripe_index++; - if (stripe_index == map->num_stripes) { - /* This could only happen for RAID0/10 */ - stripe_index = 0; - stripe_nr++; - } - } - } else { - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; - } + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; } if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) @@ -5734,8 +5796,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, sort_parity_stripes(bbio, num_stripes); tgtdev_indexes = 0; - if (dev_replace_is_ongoing && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) && + if (dev_replace_is_ongoing && op == BTRFS_MAP_WRITE && dev_replace->tgtdev != NULL) { int index_where_to_add; u64 srcdev_devid = dev_replace->srcdev->devid; -- cgit v1.2.3 From 5ab56090b8824c79748931f322f71da1d702fa08 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 Mar 2017 13:33:57 -0700 Subject: Btrfs: introduce a function to get extra mirror from replace As the part of getting extra mirror in __btrfs_map_block is independent, this puts it into a separate function. Reviewed-by: Qu Wenruo Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 161 +++++++++++++++++++++++++++++------------------------ 1 file changed, 89 insertions(+), 72 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d777d626aa87..8865e440736a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, + enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, + int mirror_num, int need_raid_map); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -5450,6 +5455,83 @@ out: return ret; } +/* + * In dev-replace case, for repair case (that's the only case where the mirror + * is selected explicitly when calling btrfs_map_block), blocks left of the + * left cursor can also be read from the target drive. + * + * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the + * array of stripes. + * For READ, it also needs to be supported using the same mirror number. + * + * If the requested block is not left of the left cursor, EIO is returned. This + * can happen because btrfs_num_copies() returns one more in the dev-replace + * case. + */ +static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + u64 srcdev_devid, int *mirror_num, + u64 *physical) +{ + struct btrfs_bio *bbio = NULL; + int num_stripes; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + int i; + int ret = 0; + + ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + logical, &length, &bbio, 0, 0); + if (ret) { + ASSERT(bbio == NULL); + return ret; + } + + num_stripes = bbio->num_stripes; + if (*mirror_num > num_stripes) { + /* + * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, + * that means that the requested area is not left of the left + * cursor + */ + btrfs_put_bbio(bbio); + return -EIO; + } + + /* + * process the rest of the function using the mirror_num of the source + * drive. Therefore look it up first. At the end, patch the device + * pointer to the one of the target drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add the + * mirror with the lowest physical address + */ + if (found && + physical_of_found <= bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + + btrfs_put_bbio(bbio); + + ASSERT(found); + if (!found) + return -EIO; + + *mirror_num = index_srcdev + 1; + *physical = physical_of_found; + return ret; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5554,79 +5636,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { - /* - * in dev-replace case, for repair case (that's the only - * case where the mirror is selected explicitly when - * calling btrfs_map_block), blocks left of the left cursor - * can also be read from the target drive. - * For REQ_GET_READ_MIRRORS, the target drive is added as - * the last one to the array of stripes. For READ, it also - * needs to be supported using the same mirror number. - * If the requested block is not left of the left cursor, - * EIO is returned. This can happen because btrfs_num_copies() - * returns one more in the dev-replace case. - */ - u64 tmp_length = *length; - struct btrfs_bio *tmp_bbio = NULL; - int tmp_num_stripes; - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, 0); - if (ret) { - WARN_ON(tmp_bbio != NULL); - goto out; - } - - tmp_num_stripes = tmp_bbio->num_stripes; - if (mirror_num > tmp_num_stripes) { - /* - * BTRFS_MAP_GET_READ_MIRRORS does not contain this - * mirror, that means that the requested area - * is not left of the left cursor - */ - ret = -EIO; - btrfs_put_bbio(tmp_bbio); - goto out; - } - - /* - * process the rest of the function using the mirror_num - * of the source drive. Therefore look it up first. - * At the end, patch the device pointer to the one of the - * target drive. - */ - for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) - continue; - - /* - * In case of DUP, in order to keep it simple, only add - * the mirror with the lowest physical address - */ - if (found && - physical_of_found <= tmp_bbio->stripes[i].physical) - continue; - - index_srcdev = i; - found = 1; - physical_of_found = tmp_bbio->stripes[i].physical; - } - - btrfs_put_bbio(tmp_bbio); - - if (!found) { - WARN_ON(1); - ret = -EIO; + ret = get_extra_mirror_from_replace(fs_info, logical, *length, + dev_replace->srcdev->devid, + &mirror_num, + &physical_to_patch_in_first_stripe); + if (ret) goto out; - } - - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; + else + patch_the_first_stripe_for_dev_replace = 1; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } -- cgit v1.2.3 From 73c0f228250ff7fc1d662b64123f2aea9d04ca7e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 Mar 2017 13:33:58 -0700 Subject: Btrfs: handle operations for device replace separately Since this part is mostly independent, this moves it to a separate function. Reviewed-by: Qu Wenruo Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 179 +++++++++++++++++++++++++++++------------------------ 1 file changed, 98 insertions(+), 81 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8865e440736a..f27f9cebce24 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5532,6 +5532,100 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, return ret; } +static void handle_ops_on_dev_replace(enum btrfs_map_op op, + struct btrfs_bio **bbio_ret, + struct btrfs_dev_replace *dev_replace, + int *num_stripes_ret, int *max_errors_ret) +{ + struct btrfs_bio *bbio = *bbio_ret; + u64 srcdev_devid = dev_replace->srcdev->devid; + int tgtdev_indexes = 0; + int num_stripes = *num_stripes_ret; + int max_errors = *max_errors_ret; + int i; + + if (op == BTRFS_MAP_WRITE) { + int index_where_to_add; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk to + * the new disk takes place at run time while the filesystem is + * mounted writable, the regular write operations to the old + * disk have to be duplicated to go to the new disk as well. + * + * Note that device->missing is handled by the caller, and that + * the write to the old disk is already set up in the stripes + * array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + bbio->tgtdev_map[i] = index_where_to_add; + index_where_to_add++; + max_errors++; + tgtdev_indexes++; + } + } + num_stripes = index_where_to_add; + } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can also + * be used to read data in case it is needed to repair a corrupt + * block elsewhere. This is possible if the requested area is + * left of the left cursor. In this area, the target drive is a + * full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it simple, + * only add the mirror with the lowest physical + * address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; + + tgtdev_indexes++; + num_stripes++; + } + } + + *num_stripes_ret = num_stripes; + *max_errors_ret = max_errors; + bbio->num_tgtdevs = tgtdev_indexes; + *bbio_ret = bbio; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5812,86 +5906,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); - tgtdev_indexes = 0; - if (dev_replace_is_ongoing && op == BTRFS_MAP_WRITE && - dev_replace->tgtdev != NULL) { - int index_where_to_add; - u64 srcdev_devid = dev_replace->srcdev->devid; - - /* - * duplicate the write operations while the dev replace - * procedure is running. Since the copying of the old disk - * to the new disk takes place at run time while the - * filesystem is mounted writable, the regular write - * operations to the old disk have to be duplicated to go - * to the new disk as well. - * Note that device->missing is handled by the caller, and - * that the write to the old disk is already set up in the - * stripes array. - */ - index_where_to_add = num_stripes; - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; - - new->physical = old->physical; - new->length = old->length; - new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; - index_where_to_add++; - max_errors++; - tgtdev_indexes++; - } - } - num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && - op == BTRFS_MAP_GET_READ_MIRRORS && - dev_replace->tgtdev != NULL) { - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - /* - * During the dev-replace procedure, the target drive can - * also be used to read data in case it is needed to repair - * a corrupt block elsewhere. This is possible if the - * requested area is left of the left cursor. In this area, - * the target drive is a full copy of the source drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = bbio->stripes[i].physical; - } - } - if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; - - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; - - tgtdev_indexes++; - num_stripes++; - } + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)) { + handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, + &max_errors); } *bbio_ret = bbio; @@ -5899,7 +5917,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; - bbio->num_tgtdevs = tgtdev_indexes; /* * this is the case that REQ_READ && dev_replace_is_ongoing && -- cgit v1.2.3 From 6fad823f4998cdff49c443891a0acc9f1bc289f9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 Mar 2017 13:33:59 -0700 Subject: Btrfs: do not add extra mirror when dev_replace target dev is not available With this, we can avoid allocating memory for dev replace copies if the target dev is not available. Reviewed-by: Qu Wenruo Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f27f9cebce24..8cc40268c50c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5157,7 +5157,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) free_extent_map(em); btrfs_dev_replace_lock(&fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && + fs_info->dev_replace.tgtdev) ret++; btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -5845,7 +5846,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { if (op == BTRFS_MAP_WRITE) num_alloc_stripes <<= 1; if (op == BTRFS_MAP_GET_READ_MIRRORS) @@ -5858,7 +5859,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } - if (dev_replace_is_ongoing) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); /* build raid_map */ -- cgit v1.2.3 From 2b19a1fef7be743505f7af6ae47065a2253d6f4e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 Mar 2017 13:34:00 -0700 Subject: Btrfs: helper for ops that requires full stripe This adds a helper to show directly whether ops require full stripe. Reviewed-by: Qu Wenruo Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8cc40268c50c..259bd2120da5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5627,6 +5627,11 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, *bbio_ret = bbio; } +static bool need_full_stripe(enum btrfs_map_op op) +{ + return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5729,8 +5734,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS && - dev_replace->tgtdev != NULL) { + !need_full_stripe(op) && dev_replace->tgtdev != NULL) { ret = get_extra_mirror_from_replace(fs_info, logical, *length, dev_replace->srcdev->devid, &mirror_num, @@ -5863,10 +5867,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); /* build raid_map */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && - ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) || - mirror_num > 1)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5901,14 +5903,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_index++; } - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)) { + need_full_stripe(op)) { handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, &max_errors); } -- cgit v1.2.3 From 539b50d2f68944e7c1184c68da5c535499cbccfe Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 Mar 2017 13:34:01 -0700 Subject: Btrfs: convert BUG_ON to WARN_ON These two BUG_ON()s would never be true, ensured by callers' logic. Reviewed-by: Qu Wenruo Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 259bd2120da5..13574f37dc36 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5174,7 +5174,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, unsigned long len = fs_info->sectorsize; em = get_chunk_map(fs_info, logical, len); - BUG_ON(IS_ERR(em)); + WARN_ON(IS_ERR(em)); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -5191,7 +5191,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, int ret = 0; em = get_chunk_map(fs_info, logical, len); - BUG_ON(IS_ERR(em)); + WARN_ON(IS_ERR(em)); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) -- cgit v1.2.3 From 1a79c1f2466bb54b329155b2b6910f386dc5aa9d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 6 Mar 2017 13:49:02 -0800 Subject: Btrfs: update comments in cache_save_setup We also don't bother to flush free space cache while with free space tree. Cc: David Sterba Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5a7ddcff406b..1b9ede123ffb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3443,7 +3443,8 @@ again: /* * don't bother trying to write stuff out _if_ * a) we're not cached, - * b) we're with nospace_cache mount option. + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); -- cgit v1.2.3 From 4d339d01063a248b3deb3aff94aaf53be6e84e08 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Sat, 4 Mar 2017 12:32:50 -0600 Subject: btrfs: No need to check !(flags & MS_RDONLY) twice Code cleanup. The code block is for !(*flags & MS_RDONLY). We don't need to check it again. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9530a333d302..6346876c97ea 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1788,8 +1788,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures && - !(*flags & MS_RDONLY)) { + fs_info->num_tolerated_disk_barrier_failures) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; -- cgit v1.2.3 From 261cc2cca0a8c1d817be65434052feb1db1fd961 Mon Sep 17 00:00:00 2001 From: Hans van Kranenburg Date: Wed, 8 Mar 2017 18:58:43 +0100 Subject: Btrfs: consistent usage of types in balance_args The btrfs_balance_args are only used for the balance ioctl, so use __u instead of __le here for consistency. The __le usage was introduced in bc3094673f22d and dee32d0ac3719 and was probably a result of copy/pasting when the code was written. The usage of __le did not break anything, but it's unnecessary. Also, this change makes the code less confusing for the careful reader. Signed-off-by: Hans van Kranenburg Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index dcfc3a5a9cb1..a456e5309238 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -291,10 +291,10 @@ struct btrfs_ioctl_feature_flags { struct btrfs_balance_args { __u64 profiles; union { - __le64 usage; + __u64 usage; struct { - __le32 usage_min; - __le32 usage_max; + __u32 usage_min; + __u32 usage_max; }; }; __u64 devid; @@ -324,8 +324,8 @@ struct btrfs_balance_args { * Process chunks that cross stripes_min..stripes_max devices, * BTRFS_BALANCE_ARGS_STRIPES_RANGE */ - __le32 stripes_min; - __le32 stripes_max; + __u32 stripes_min; + __u32 stripes_max; __u64 unused[6]; } __attribute__ ((__packed__)); -- cgit v1.2.3 From cc8385b59e17d489814d54d6a846aba7051fdea5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Mar 2017 18:54:52 +0100 Subject: btrfs: preallocate radix tree node for readahead We can preallocate the node so insertion does not have to do that under the lock. The GFP flags for the per-device radix tree are initialized to GFP_NOFS & ~__GFP_DIRECT_RECLAIM but we can use GFP_KERNEL, same as an allocation above anyway, but also because readahead is optional and not on any critical writeout path. Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/reada.c | 7 +++++++ fs/btrfs/volumes.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e88bca87f5d2..fdae8ca79401 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -270,6 +270,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, if (!zone) return NULL; + ret = radix_tree_preload(GFP_KERNEL); + if (ret) { + kfree(zone); + return NULL; + } + zone->start = start; zone->end = end; INIT_LIST_HEAD(&zone->list); @@ -299,6 +305,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, zone = NULL; } spin_unlock(&fs_info->reada_lock); + radix_tree_preload_end(); return zone; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 13574f37dc36..df50a63bbc3f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -252,7 +252,7 @@ static struct btrfs_device *__alloc_device(void) atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_RADIX_TREE(&dev->reada_zones, GFP_KERNEL); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); return dev; -- cgit v1.2.3 From 7ef70b4d9987a78d39c4e40a02093493333e5408 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Mar 2017 18:54:52 +0100 Subject: btrfs: preallocate radix tree node for global readahead tree We can preallocate the node so insertion does not have to do that under the lock. The GFP flags for the global radix tree are initialized to GFP_NOFS & ~__GFP_DIRECT_RECLAIM but we can use GFP_KERNEL, because readahead is optional and not on any critical writeout path. Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/reada.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd415e1dd114..12eba54877c2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2693,7 +2693,7 @@ int open_ctree(struct super_block *sb, fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_KERNEL); spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index fdae8ca79401..a7fa4a5cb296 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -393,6 +393,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; } + ret = radix_tree_preload(GFP_KERNEL); + if (ret) + goto error; + /* insert extent in reada_tree + all per-device trees, all or nothing */ btrfs_dev_replace_lock(&fs_info->dev_replace, 0); spin_lock(&fs_info->reada_lock); @@ -402,13 +406,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } + radix_tree_preload_end(); prev_dev = NULL; dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( &fs_info->dev_replace); -- cgit v1.2.3 From d48d71aa9977473b6515bb48933617a06cdc7be9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Mar 2017 19:43:30 +0100 Subject: btrfs: remove redundant parameter from btree_readahead_hook We can read fs_info from eb. Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/reada.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 285566cc2f7d..7343bf7f2b35 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3672,8 +3672,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err); +int btree_readahead_hook(struct extent_buffer *eb, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 12eba54877c2..a9314fe494c9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -762,7 +762,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(fs_info, eb, ret); + btree_readahead_hook(eb, ret); if (ret) { /* @@ -787,7 +787,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb->fs_info, eb, -EIO); + btree_readahead_hook(eb, -EIO); return -EIO; /* we fixed nothing */ } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a7fa4a5cb296..306e5108aac7 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -209,9 +209,9 @@ cleanup: return; } -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err) +int btree_readahead_hook(struct extent_buffer *eb, int err) { + struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; struct reada_extent *re; -- cgit v1.2.3 From 0ceaf282137f261abf634f96b15c535baa6a9c93 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Mar 2017 19:43:30 +0100 Subject: btrfs: remove redundant parameter from reada_find_zone We can read fs_info from dev. Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/reada.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 306e5108aac7..2deba18886db 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -235,10 +235,10 @@ start_machine: return ret; } -static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev, u64 logical, +static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = dev->fs_info; int ret; struct reada_zone *zone; struct btrfs_block_group_cache *cache = NULL; @@ -374,7 +374,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!dev->bdev) continue; - zone = reada_find_zone(fs_info, dev, logical, bbio); + zone = reada_find_zone(dev, logical, bbio); if (!zone) continue; -- cgit v1.2.3 From 5721b8ad260f6ed15e3c82447981dd7a2f4bdf72 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Mar 2017 19:43:30 +0100 Subject: btrfs: remove redundant parameter from reada_start_machine_dev We can read fs_info from dev. Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/reada.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 2deba18886db..5c1e6bbed132 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -653,9 +653,9 @@ static int reada_pick_zone(struct btrfs_device *dev) return 1; } -static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev) +static int reada_start_machine_dev(struct btrfs_device *dev) { + struct btrfs_fs_info *fs_info = dev->fs_info; struct reada_extent *re = NULL; int mirror_num = 0; struct extent_buffer *eb = NULL; @@ -768,8 +768,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(fs_info, - device); + enqueued += reada_start_machine_dev(device); } mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; -- cgit v1.2.3 From 994a5d2bc758792f1808d7b112990a14182ef847 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 15 Mar 2017 16:39:59 +0100 Subject: btrfs: remove local blocksize variable in reada_find_extent The name is misleading and the local variable serves no purpose. Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/reada.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 5c1e6bbed132..a17e775a4a89 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -320,7 +320,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; - u32 blocksize; u64 length; int real_stripes; int nzones = 0; @@ -341,7 +340,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!re) return NULL; - blocksize = fs_info->nodesize; re->logical = logical; re->top = *top; INIT_LIST_HEAD(&re->extctl); @@ -351,10 +349,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, /* * map block */ - length = blocksize; + length = fs_info->nodesize; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &length, &bbio, 0); - if (ret || !bbio || length < blocksize) + if (ret || !bbio || length < fs_info->nodesize) goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { -- cgit v1.2.3 From f486135ebab4fb91366a1e41fb15ed3036ad0cf9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 15 Mar 2017 16:17:03 +0100 Subject: btrfs: remove unused qgroup members from btrfs_trans_handle The members have been effectively unused since "Btrfs: rework qgroup accounting" (fcebe4562dec83b3), there's no substitute for assert_qgroups_uptodate so it's removed as well. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1 - fs/btrfs/qgroup.c | 12 ------------ fs/btrfs/qgroup.h | 1 - fs/btrfs/tests/btrfs-tests.c | 1 - fs/btrfs/transaction.c | 3 --- fs/btrfs/transaction.h | 2 -- 6 files changed, 20 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1b9ede123ffb..e870e60e33bc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3003,7 +3003,6 @@ again: goto again; } out: - assert_qgroups_uptodate(trans); trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a59801dc2a34..b2fdefc6d191 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2487,18 +2487,6 @@ out: spin_unlock(&fs_info->qgroup_lock); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) -{ - if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) - return; - btrfs_err(trans->fs_info, - "qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x", - trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); -} - /* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 26932a8a1993..96fc56ebf55a 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -196,7 +196,6 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ea272432c930..b18ab8f327a5 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) { memset(trans, 0, sizeof(*trans)); trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a7d7a7d1d78a..ee5844855c15 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -572,7 +572,6 @@ again: h->type = type; h->can_flush_pending_bgs = true; - INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); smp_mb(); @@ -917,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up_process(info->transaction_kthread); err = -EIO; } - assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { @@ -2223,7 +2221,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) switch_commit_roots(cur_trans, fs_info); - assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(fs_info); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 902619f83db6..c55e44560103 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -127,8 +127,6 @@ struct btrfs_trans_handle { unsigned int type; struct btrfs_root *root; struct btrfs_fs_info *fs_info; - struct seq_list delayed_ref_elem; - struct list_head qgroup_ref_list; struct list_head new_bgs; }; -- cgit v1.2.3 From f58d88b336c1c83a35d6b2582d45cd87cb2ca406 Mon Sep 17 00:00:00 2001 From: Edmund Nadolski Date: Thu, 16 Mar 2017 10:04:33 -0600 Subject: btrfs: provide enumeration for __merge_refs mode argument Replace hardcoded numeric values for __merge_refs 'mode' argument with descriptive constants. Signed-off-by: Edmund Nadolski Reviewed-by: Jeff Mahoney Signed-off-by: David Sterba --- fs/btrfs/backref.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 116338344224..6ce7281749a8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -26,6 +26,11 @@ #include "delayed-ref.h" #include "locking.h" +enum merge_mode { + MERGE_IDENTICAL_KEYS = 1, + MERGE_IDENTICAL_PARENTS, +}; + /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, /* * merge backrefs and adjust counts accordingly * - * mode = 1: merge identical keys, if key is set - * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. - * additionally, we could even add a key range for the blocks we - * looked into to merge even more (-> replace unresolved refs by those - * having a parent). - * mode = 2: merge identical parents + * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref + * then we can merge more here. Additionally, we could even add a key + * range for the blocks we looked into to merge even more (-> replace + * unresolved refs by those having a parent). */ -static void __merge_refs(struct list_head *head, int mode) +static void __merge_refs(struct list_head *head, enum merge_mode mode) { struct __prelim_ref *pos1; @@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode) if (!ref_for_same_block(ref1, ref2)) continue; - if (mode == 1) { + if (mode == MERGE_IDENTICAL_KEYS) { if (!ref1->parent && ref2->parent) swap(ref1, ref2); } else { @@ -1374,7 +1377,7 @@ again: if (ret) goto out; - __merge_refs(&prefs, 1); + __merge_refs(&prefs, MERGE_IDENTICAL_KEYS); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, extent_item_pos, total_refs, @@ -1382,7 +1385,7 @@ again: if (ret) goto out; - __merge_refs(&prefs, 2); + __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS); while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); -- cgit v1.2.3 From de47c9d3ff875a3cb1251af35ee8d30afaca78bd Mon Sep 17 00:00:00 2001 From: Edmund Nadolski Date: Thu, 16 Mar 2017 10:04:34 -0600 Subject: btrfs: replace hardcoded value with SEQ_LAST macro Define the SEQ_LAST macro to replace (u64)-1 in places where said value triggers a special-case ref search behavior. Signed-off-by: Edmund Nadolski Reviewed-by: Jeff Mahoney Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/backref.c | 16 ++++++++-------- fs/btrfs/ctree.h | 2 ++ fs/btrfs/qgroup.c | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6ce7281749a8..24865da63d8f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -538,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * slot==nritems. In that case, go to the next leaf before we continue. */ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -582,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -634,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == (u64)-1) + else if (time_seq == SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -645,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 0, 0); else @@ -1199,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * - * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave * much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). @@ -1246,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) path->skip_locking = 1; /* @@ -1276,9 +1276,9 @@ again: #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != (u64)-1) { + time_seq != SEQ_LAST) { #else - if (trans && time_seq != (u64)-1) { + if (trans && time_seq != SEQ_LAST) { #endif /* * look if there are updates for this ref queued and lock the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7343bf7f2b35..e9d77115e695 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -659,6 +659,8 @@ struct seq_list { #define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define SEQ_LAST ((u64)-1) + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b2fdefc6d191..21647a414aa5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2055,12 +2055,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, if (!ret) { /* - * Use (u64)-1 as time_seq to do special search, which + * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current * root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, (u64)-1, &new_roots); + record->bytenr, SEQ_LAST, &new_roots); if (ret < 0) goto cleanup; if (qgroup_to_skip) -- cgit v1.2.3 From 48a89bc4f2ceab87bc858a8eb189636b09c846a7 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 27 Mar 2017 12:29:57 -0500 Subject: btrfs: qgroups: Retry after commit on getting EDQUOT We are facing the same problem with EDQUOT which was experienced with ENOSPC. Not sure if we require a full ticketing system such as ENOSPC, but here is a quick fix, which may be too big a hammer. Quotas are reserved during the start of an operation, incrementing qg->reserved. However, it is written to disk in a commit_transaction which could take as long as commit_interval. In the meantime there could be deletions which are not accounted for because deletions are accounted for only while committed (free_refroot). So, when we get a EDQUOT flush the data to disk and try again. This fixes fstests btrfs/139. Here is a sample script which shows this issue. DEVICE=/dev/vdb MOUNTPOINT=/mnt TESTVOL=$MOUNTPOINT/tmp QUOTA=5 PROG=btrfs DD_BS="4k" DD_COUNT="256" RUN_TIMES=5000 mkfs.btrfs -f $DEVICE mount -o commit=240 $DEVICE $MOUNTPOINT $PROG subvolume create $TESTVOL $PROG quota enable $TESTVOL $PROG qgroup limit ${QUOTA}G $TESTVOL typeset -i DD_RUN_GOOD typeset -i QUOTA function _check_cmd() { if [[ ${?} > 0 ]]; then echo -n "$(date) E: Running previous command" echo ${*} echo "Without sync" $PROG qgroup show -pcreFf ${TESTVOL} echo "With sync" $PROG qgroup show -pcreFf --sync ${TESTVOL} exit 1 fi } while true; do DD_RUN_GOOD=$RUN_TIMES while (( ${DD_RUN_GOOD} != 0 )); do dd if=/dev/zero of=${TESTVOL}/quotatest${DD_RUN_GOOD} bs=${DD_BS} count=${DD_COUNT} _check_cmd "dd if=/dev/zero of=${TESTVOL}/quotatest${DD_RUN_GOOD} bs=${DD_BS} count=${DD_COUNT}" DD_RUN_GOOD=(${DD_RUN_GOOD}-1) done $PROG qgroup show -pcref $TESTVOL echo "----------- Cleanup ---------- " rm $TESTVOL/quotatest* done Signed-off-by: Goldwyn Rodrigues Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 21647a414aa5..1d7942c5a38a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2367,6 +2367,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; + int retried = 0; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2375,7 +2376,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) if (num_bytes == 0) return 0; - +retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; if (!quota_root) @@ -2402,6 +2403,27 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) qg = unode_aux_to_qgroup(unode); if (enforce && !qgroup_check_limits(qg, num_bytes)) { + /* + * Commit the tree and retry, since we may have + * deletions which would free up space. + */ + if (!retried && qg->reserved > 0) { + struct btrfs_trans_handle *trans; + + spin_unlock(&fs_info->qgroup_lock); + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + retried++; + goto retry; + } ret = -EDQUOT; goto out; } -- cgit v1.2.3 From 171938e528079deced3226a17dcab12121312a64 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 28 Mar 2017 14:44:21 +0200 Subject: btrfs: track exclusive filesystem operation in flags There are several operations, usually started from ioctls, that cannot run concurrently. The status is tracked in mutually_exclusive_operation_running as an atomic_t. We can easily track the status as one of the per-filesystem flag bits with same synchronization guarantees. The conversion replaces: * atomic_xchg(..., 1) -> test_and_set_bit(FLAG, ...) * atomic_set(..., 0) -> clear_bit(FLAG, ...) Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 +++++-- fs/btrfs/dev-replace.c | 5 ++--- fs/btrfs/ioctl.c | 33 +++++++++++++++------------------ fs/btrfs/volumes.c | 6 +++--- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e9d77115e695..70631d773669 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -705,6 +705,11 @@ struct btrfs_delayed_root; #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 +/* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ +#define BTRFS_FS_EXCL_OP 14 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -1070,8 +1075,6 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; - atomic_t mutually_exclusive_operation_running; - struct percpu_counter bio_counter; wait_queue_head_t replace_wait; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e653921f05d9..de7b2c897fe0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -784,8 +784,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) } btrfs_dev_replace_unlock(dev_replace, 1); - WARN_ON(atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); return PTR_ERR_OR_ZERO(task); } @@ -814,7 +813,7 @@ static int btrfs_dev_replace_kthread(void *data) (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dabfc7ac48a6..a29dc3fd7152 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1504,7 +1504,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1619,7 +1619,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); mnt_drop_write_file(file); return ret; } @@ -2661,7 +2661,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; mutex_lock(&fs_info->volume_mutex); @@ -2680,7 +2680,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return ret; } @@ -2708,7 +2708,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) return -EOPNOTSUPP; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -2721,7 +2721,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = btrfs_rm_device(fs_info, vol_args->name, 0); } mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -2752,7 +2752,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -2772,7 +2772,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out_drop_write: mnt_drop_write_file(file); @@ -4442,13 +4442,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - atomic_set( - &fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -4643,7 +4641,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); need_unlock = true; @@ -4689,7 +4687,7 @@ again: } locked: - BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); + BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4745,11 +4743,10 @@ locked: do_balance: /* - * Ownership of bctl and mutually_exclusive_operation_running + * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP * goes to to btrfs_balance. bctl is freed in __cancel_balance, * or, if restriper was paused all the way until unmount, in - * free_fs_info. mutually_exclusive_operation_running is - * cleared in __cancel_balance. + * free_fs_info. The flag is cleared in __cancel_balance. */ need_unlock = false; @@ -4769,7 +4766,7 @@ out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); if (need_unlock) - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out: mnt_drop_write_file(file); return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index df50a63bbc3f..2e425a909125 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3761,7 +3761,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) if (ret) btrfs_handle_fs_error(fs_info, ret, NULL); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } /* Non-zero return value signifies invalidity */ @@ -3941,7 +3941,7 @@ out: __cancel_balance(fs_info); else { kfree(bctl); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } return ret; } @@ -4031,7 +4031,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); -- cgit v1.2.3 From 1bcd7aa17fc1fa29009feec3e0ac4230d0d9f700 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 29 Mar 2017 10:55:16 -0700 Subject: Btrfs: set scrub page's io_error if failing to submit io Scrub repairs data by the unit called scrub_block, which may contain several pages. Scrub always tries to look up a good copy of a whole block, but if there's no such copy, it tries to do repair page by page. If we don't set page's io_error when checking this bad copy, in the last step, we may skip this page when repairing bad copy from good copy. Cc: David Sterba Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6ba0c9ddf777..f499ed782671 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1497,14 +1497,18 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { - if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) + if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } else { bio->bi_iter.bi_sector = page->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(bio)) + if (btrfsic_submit_bio_wait(bio)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } bio_put(bio); -- cgit v1.2.3 From abad60c6013d62f4ddd19667260e2e4a957a9c2a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 29 Mar 2017 10:54:26 -0700 Subject: Btrfs: fix wrong failed mirror_num of read-repair on raid56 In raid56 scenario, after trying parity recovery, we didn't set mirror_num for btrfs_bio with failed mirror_num, hence end_bio_extent_readpage() will report a random mirror_num in dmesg log. Cc: David Sterba Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a8954f5188b4..7bf65d9a9bd3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2117,6 +2117,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_raid_bio *rbio; int ret; + if (generic_io) { + ASSERT(bbio->mirror_num == mirror_num); + btrfs_io_bio(bio)->mirror_num = mirror_num; + } + rbio = alloc_rbio(fs_info, bbio, stripe_len); if (IS_ERR(rbio)) { if (generic_io) -- cgit v1.2.3 From 176ef8f5e662a79471f437839cd9a0c5b95b1e8d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 28 Mar 2017 14:35:01 +0200 Subject: btrfs: sink GFP flags parameter to tree_mod_log_insert_move All (1) callers pass the same value. Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7dc8844037e0..d034d47c5470 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -567,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, - int nr_items, gfp_t flags) + int nr_items) { struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; @@ -578,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -596,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -873,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, { int ret; ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, - nr_items, GFP_NOFS); + nr_items); BUG_ON(ret < 0); } -- cgit v1.2.3 From bcc8e07f9ef1cb3a302bed1d41d27ca72eaacc33 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 28 Mar 2017 14:35:42 +0200 Subject: btrfs: sink GFP flags parameter to tree_mod_log_insert_root All (1) callers pass the same value. Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d034d47c5470..165e7ec12af7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -663,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, - struct extent_buffer *new_root, gfp_t flags, + struct extent_buffer *new_root, int log_removal) { struct tree_mod_elem *tm = NULL; @@ -678,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), - flags); + GFP_NOFS); if (!tm_list) { ret = -ENOMEM; goto free_tms; } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -693,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, } } - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -943,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root, { int ret; ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, GFP_NOFS, log_removal); + new_root_node, log_removal); BUG_ON(ret < 0); } -- cgit v1.2.3 From 825ad4c96432a3908625dd1a79d6528e969e1a09 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 28 Mar 2017 14:45:22 +0200 Subject: btrfs: drop redundant parameters from btrfs_map_sblock All callers pass 0 for mirror_num and 1 for need_raid_map. Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 6 +++--- fs/btrfs/volumes.c | 6 ++---- fs/btrfs/volumes.h | 3 +-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f499ed782671..fe1c793877dd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1331,7 +1331,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, * represents one mirror */ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bbio, 0, 1); + logical, &mapped_length, &bbio); if (ret || !bbio || mapped_length < sublen) { btrfs_put_bbio(bbio); return -EIO; @@ -2192,7 +2192,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) int i; ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2780,7 +2780,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) length = sparity->logic_end - sparity->logic_start; ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2e425a909125..3b3be8eba776 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5952,11 +5952,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map) + struct btrfs_bio **bbio_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, - mirror_num, need_raid_map); + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } int btrfs_rmap_block(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9ee6b706db31..def39b5a8d9e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -400,8 +400,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map); + struct btrfs_bio **bbio_ret); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); -- cgit v1.2.3 From 3159fe7baef3a50fc332455e252d8a01a18f1ff1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Mar 2017 15:52:08 +0800 Subject: btrfs: qgroup: Add trace point for qgroup reserved space Introduce the following trace points: qgroup_update_reserve qgroup_meta_reserve These trace points are handy to trace qgroup reserve space related problems. Also export btrfs_qgroup structure, as now we directly pass btrfs_qgroup structure to trace points, so that structure needs to be exported. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 52 +++++++------------------------------------- fs/btrfs/qgroup.h | 44 +++++++++++++++++++++++++++++++++++++ include/trace/events/btrfs.h | 44 +++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1d7942c5a38a..27a229619808 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -47,50 +47,6 @@ * - check all ioctl parameters */ -/* - * one struct for each qgroup, organized in fs_info->qgroup_tree. - */ -struct btrfs_qgroup { - u64 qgroupid; - - /* - * state - */ - u64 rfer; /* referenced */ - u64 rfer_cmpr; /* referenced compressed */ - u64 excl; /* exclusive */ - u64 excl_cmpr; /* exclusive compressed */ - - /* - * limits - */ - u64 lim_flags; /* which limits are set */ - u64 max_rfer; - u64 max_excl; - u64 rsv_rfer; - u64 rsv_excl; - - /* - * reservation tracking - */ - u64 reserved; - - /* - * lists - */ - struct list_head groups; /* groups this group is member of */ - struct list_head members; /* groups that are members of this group */ - struct list_head dirty; /* dirty groups */ - struct rb_node node; /* tree of qgroups */ - - /* - * temp variables for accounting operations - * Refer to qgroup_shared_accounting() for details. - */ - u64 old_refcnt; - u64 new_refcnt; -}; - static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { @@ -1075,6 +1031,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) { + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes); if (WARN_ON(qgroup->reserved < num_bytes)) report_reserved_underflow(fs_info, qgroup, num_bytes); else @@ -1100,6 +1057,8 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; if (sign > 0) { + trace_qgroup_update_reserve(fs_info, qgroup, + -(s64)num_bytes); if (WARN_ON(qgroup->reserved < num_bytes)) report_reserved_underflow(fs_info, qgroup, num_bytes); @@ -2446,6 +2405,7 @@ retry: qg = unode_aux_to_qgroup(unode); + trace_qgroup_update_reserve(fs_info, qg, num_bytes); qg->reserved += num_bytes; } @@ -2491,6 +2451,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); + trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); if (WARN_ON(qg->reserved < num_bytes)) report_reserved_underflow(fs_info, qg, num_bytes); else @@ -2955,6 +2916,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); + trace_qgroup_meta_reserve(root, (s64)num_bytes); ret = qgroup_reserve(root, num_bytes, enforce); if (ret < 0) return ret; @@ -2974,6 +2936,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0); if (reserved == 0) return; + trace_qgroup_meta_reserve(root, -(s64)reserved); btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); } @@ -2988,6 +2951,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); atomic64_sub(num_bytes, &root->qgroup_meta_rsv); + trace_qgroup_meta_reserve(root, -(s64)num_bytes); btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 96fc56ebf55a..31e468b16175 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -61,6 +61,50 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +/* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + * Refer to qgroup_shared_accounting() for details. + */ + u64 old_refcnt; + u64 new_refcnt; +}; + /* * For qgroup event trace points only */ diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index cef39e2baf21..e37973526153 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -25,6 +25,7 @@ struct extent_buffer; struct btrfs_work; struct __btrfs_workqueue; struct btrfs_qgroup_extent_record; +struct btrfs_qgroup; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1614,6 +1615,49 @@ TRACE_EVENT(qgroup_update_counters, __entry->cur_new_count) ); +TRACE_EVENT(qgroup_update_reserve, + + TP_PROTO(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, + s64 diff), + + TP_ARGS(fs_info, qgroup, diff), + + TP_STRUCT__entry_btrfs( + __field( u64, qgid ) + __field( u64, cur_reserved ) + __field( s64, diff ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->qgid = qgroup->qgroupid; + __entry->cur_reserved = qgroup->reserved; + __entry->diff = diff; + ), + + TP_printk_btrfs("qgid=%llu cur_reserved=%llu diff=%lld", + __entry->qgid, __entry->cur_reserved, __entry->diff) +); + +TRACE_EVENT(qgroup_meta_reserve, + + TP_PROTO(struct btrfs_root *root, s64 diff), + + TP_ARGS(root, diff), + + TP_STRUCT__entry_btrfs( + __field( u64, refroot ) + __field( s64, diff ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->refroot = root->objectid; + __entry->diff = diff; + ), + + TP_printk_btrfs("refroot=%llu(%s) diff=%lld", + show_root_type(__entry->refroot), __entry->diff) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From d51ea5dd222d13dc46c76b2446aa59c4183e3922 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Mar 2017 15:52:09 +0800 Subject: btrfs: qgroup: Re-arrange tracepoint timing to co-operate with reserved space tracepoint Newly introduced qgroup reserved space trace points are normally nested into several common qgroup operations. While some other trace points are not well placed to co-operate with them, causing confusing output. This patch re-arrange trace_btrfs_qgroup_release_data() and trace_btrfs_qgroup_free_delayed_ref() trace points so they are triggered before reserved space ones. Signed-off-by: Qu Wenruo Reviewed-by: Jeff Mahoney Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 10 +++++----- fs/btrfs/qgroup.h | 6 +----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 27a229619808..3f75b5cbbfef 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2857,14 +2857,14 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, if (ret < 0) goto out; - if (free) { - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + if (free) trace_op = QGROUP_FREE; - } trace_btrfs_qgroup_release_data(inode, start, len, changeset.bytes_changed, trace_op); + if (free) + btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, + BTRFS_I(inode)->root->objectid, + changeset.bytes_changed); out: ulist_release(&changeset.range_changed); return ret; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 31e468b16175..fe04d3f295c6 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -230,15 +230,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes); -/* - * TODO: Add proper trace point for it, as btrfs_qgroup_free() is - * called by everywhere, can't provide good trace for delayed ref case. - */ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -- cgit v1.2.3 From 9a33944bdf075ca93062cde206cb25e62044890e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 29 Mar 2017 09:33:20 +0800 Subject: btrfs: scrub: Don't append on-disk pages for raid56 scrub In the following situation, scrub will calculate wrong parity to overwrite the correct one: RAID5 full stripe: Before | Dev 1 | Dev 2 | Dev 3 | | Data stripe 1 | Data stripe 2 | Parity Stripe | --------------------------------------------------- 0 | 0x0000 (Bad) | 0xcdcd | 0x0000 | --------------------------------------------------- 4K | 0xcdcd | 0xcdcd | 0x0000 | ... | 0xcdcd | 0xcdcd | 0x0000 | --------------------------------------------------- 64K After scrubbing dev3 only: | Dev 1 | Dev 2 | Dev 3 | | Data stripe 1 | Data stripe 2 | Parity Stripe | --------------------------------------------------- 0 | 0xcdcd (Good) | 0xcdcd | 0xcdcd (Bad) | --------------------------------------------------- 4K | 0xcdcd | 0xcdcd | 0x0000 | ... | 0xcdcd | 0xcdcd | 0x0000 | --------------------------------------------------- 64K The reason is that after raid56 read rebuild rbio->stripe_pages are all correctly recovered (0xcd for data stripes). However when we check and repair parity in scrub_parity_check_and_repair(), we will append pages in sparity->spages list to rbio->bio_pages[], which contains old on-disk data. And when we submit parity data to disk, we calculate parity using rbio->bio_pages[] first, if rbio->bio_pages[] not found, then fallback to rbio->stripe_pages[]. The patch fix it by not appending pages from sparity->spages. So finish_parity_scrub() will use rbio->stripe_pages[] which is correct. Signed-off-by: Qu Wenruo Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fe1c793877dd..26dbe563fed0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2769,7 +2769,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; - struct scrub_page *spage; struct btrfs_bio *bbio = NULL; u64 length; int ret; @@ -2799,9 +2798,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (!rbio) goto rbio_out; - list_for_each_entry(spage, &sparity->spages, list) - raid56_add_scrub_pages(rbio, spage->page, spage->logical); - scrub_pending_bio_inc(sctx); raid56_parity_submit_scrub_rbio(rbio); return; -- cgit v1.2.3 From ae6529c35bcc1c65c12131cef2aea63d8e2ea950 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 29 Mar 2017 09:33:21 +0800 Subject: btrfs: Wait for in-flight bios before freeing target device for raid56 When raid56 dev-replace is cancelled by running scrub, we will free target device without waiting for in-flight bios, causing the following NULL pointer deference or general protection failure. BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0 IP: generic_make_request_checks+0x4d/0x610 CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs] task: ffff88002875b4c0 task.stack: ffffc90001334000 RIP: 0010:generic_make_request_checks+0x4d/0x610 Call Trace: ? generic_make_request+0xc7/0x360 generic_make_request+0x24/0x360 ? generic_make_request+0xc7/0x360 submit_bio+0x64/0x120 ? page_in_rbio+0x4d/0x80 [btrfs] ? rbio_orig_end_io+0x80/0x80 [btrfs] finish_rmw+0x3f4/0x540 [btrfs] validate_rbio_for_rmw+0x36/0x40 [btrfs] raid_rmw_end_io+0x7a/0x90 [btrfs] bio_endio+0x56/0x60 end_workqueue_fn+0x3c/0x40 [btrfs] btrfs_scrubparity_helper+0xef/0x620 [btrfs] btrfs_endio_raid56_helper+0xe/0x10 [btrfs] process_one_work+0x2af/0x720 ? process_one_work+0x22b/0x720 worker_thread+0x4b/0x4f0 kthread+0x10f/0x150 ? process_one_work+0x720/0x720 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x2e/0x40 RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8 In btrfs_dev_replace_finishing(), we will call btrfs_rm_dev_replace_blocked() to wait bios before destroying the target device when scrub is finished normally. However when dev-replace is aborted, either due to error or cancelled by scrub, we didn't wait for bios, this can lead to use-after-free if there are bios holding the target device. Furthermore, for raid56 scrub, at least 2 places are calling btrfs_map_sblock() without protection of bio_counter, leading to the problem. This patch fixes the problem: 1) Wait for bio_counter before freeing target device when canceling replace 2) When calling btrfs_map_sblock() for raid56, use bio_counter to protect the call. Cc: Liu Bo Signed-off-by: Qu Wenruo Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 ++ fs/btrfs/raid56.c | 14 ++++++++++++++ fs/btrfs/scrub.c | 5 +++++ 3 files changed, 21 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index de7b2c897fe0..32b2ffbb5717 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_rm_dev_replace_unblocked(fs_info); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return scrub_ret; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 7bf65d9a9bd3..d8ea0eb76325 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2198,6 +2198,8 @@ static void read_rebuild_work(struct btrfs_work *work) /* * The following code is used to scrub/replace the parity stripe * + * Caller must have already increased bio_counter for getting @bbio. + * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. @@ -2235,6 +2237,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(rbio->stripe_npages == stripe_nsectors); bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + /* + * We have already increased bio_counter when getting bbio, record it + * so we can free it at rbio_orig_end_io(). + */ + rbio->generic_bio_cnt = 1; + return rbio; } @@ -2677,6 +2685,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, return NULL; } + /* + * When we get bbio, we have already increased bio_counter, record it + * so we can free it at rbio_orig_end_io() + */ + rbio->generic_bio_cnt = 1; + return rbio; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 26dbe563fed0..e9612016325f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2191,6 +2191,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) int ret; int i; + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &length, &bbio); if (ret || !bbio || !bbio->raid_map) @@ -2235,6 +2236,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2778,6 +2780,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto out; length = sparity->logic_end - sparity->logic_start; + + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, &length, &bbio); if (ret || !bbio || !bbio->raid_map) @@ -2805,6 +2809,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); -- cgit v1.2.3 From e501bfe323356ea3f7ef79d4b0d95389b70a7193 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 29 Mar 2017 09:33:22 +0800 Subject: btrfs: Prevent scrub recheck from racing with dev replace scrub_setup_recheck_block() calls btrfs_map_sblock() and then accesses bbio without protection of bio_counter. This can lead to use-after-free if racing with dev replace cancel. Fix it by increasing bio_counter before calling btrfs_map_sblock() and decreasing the bio_counter when corresponding recover is finished. Cc: Liu Bo Reported-by: Liu Bo Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e9612016325f..c4d1e60e831e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -860,9 +860,11 @@ static inline void scrub_get_recover(struct scrub_recover *recover) refcount_inc(&recover->refs); } -static inline void scrub_put_recover(struct scrub_recover *recover) +static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, + struct scrub_recover *recover) { if (refcount_dec_and_test(&recover->refs)) { + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(recover->bbio); kfree(recover); } @@ -1241,7 +1243,7 @@ out: sblock->pagev[page_index]->sblock = NULL; recover = sblock->pagev[page_index]->recover; if (recover) { - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); sblock->pagev[page_index]->recover = NULL; } @@ -1330,16 +1332,19 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bbio); if (ret || !bbio || mapped_length < sublen) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -ENOMEM; } @@ -1365,7 +1370,7 @@ leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); return -ENOMEM; } scrub_page_get(page); @@ -1407,7 +1412,7 @@ leave_nomem: scrub_get_recover(recover); page->recover = recover; } - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; page_index++; -- cgit v1.2.3 From 619a974292387343c817f5a36e9df6daeb3ccc60 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 29 Mar 2017 20:48:44 +0200 Subject: btrfs: use clear_page where appropriate There's a helper to clear whole page, with a arch-specific optimized code. The replaced cases do not seem to be in performace critical code, but we still might get some percent gain. Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/scrub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index da6841efac26..c5e6180cdb8c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -355,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_SIZE; if (clear) - memset(io_ctl->cur, 0, PAGE_SIZE); + clear_page(io_ctl->cur); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c4d1e60e831e..48dd6f170c36 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1643,7 +1643,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, if (spage->io_error) { void *mapped_buffer = kmap_atomic(spage->page); - memset(mapped_buffer, 0, PAGE_SIZE); + clear_page(mapped_buffer); flush_dcache_page(spage->page); kunmap_atomic(mapped_buffer); } -- cgit v1.2.3 From c725328c55443f150577efcf445a7924687342e4 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 29 Mar 2017 10:53:58 -0700 Subject: Btrfs: enable repair during read for raid56 profile Now that scrub can fix data errors with the help of parity for raid56 profile, repair during read is able to as well. Although the mirror num in raid56 scenario has different meanings, i.e. 0 or 1: read data directly > 1: do recover with parity, it could be fit into how we repair bad block during read. The trick is to use BTRFS_MAP_READ instead of BTRFS_MAP_WRITE to get the device and position on it. Cc: David Sterba Tested-by: Qu Wenruo Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 77bd016ba18b..d8da3edf2ac3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2009,10 +2009,6 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); - /* we can't repair anything in raid56 yet */ - if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) - return 0; - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -2025,17 +2021,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, * read repair operation. */ btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. + */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bbio, 0); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + ASSERT(bbio->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); } - BUG_ON(mirror_num != bbio->mirror_num); - sector = bbio->stripes[mirror_num-1].physical >> 9; + + sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[mirror_num-1].dev; + dev = bbio->stripes[bbio->mirror_num - 1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { btrfs_bio_counter_dec(fs_info); -- cgit v1.2.3 From 972d7219398651aaf7ae086b4f17c3416b51c7db Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 3 Apr 2017 13:45:33 -0700 Subject: Btrfs: update scrub_parity to use u64 stripe_len Commit 3d8da6781760 ("Btrfs: fix divide error upon chunk's stripe_len") changed stripe_len in struct map_lookup to u64, but didn't update stripe_len in struct scrub_parity. This updates the type and switches to div64_u64_rem to match u64 divisor. Cc: David Sterba Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 48dd6f170c36..58c5864bf709 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -140,7 +140,7 @@ struct scrub_parity { int nsectors; - int stripe_len; + u64 stripe_len; refcount_t refs; @@ -2396,7 +2396,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - u32 offset; + u64 offset; int nsectors; int sectorsize = sparity->sctx->fs_info->sectorsize; @@ -2406,8 +2406,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - start = div_u64_rem(start, sparity->stripe_len, &offset); - offset /= sectorsize; + start = div64_u64_rem(start, sparity->stripe_len, &offset); + offset = div_u64(offset, sectorsize); nsectors = (int)len / sectorsize; if (offset + nsectors <= sparity->nsectors) { -- cgit v1.2.3 From 42c61ab6760f5f6929ebf5a73b7e32b9aa51fbd5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 3 Apr 2017 13:45:24 -0700 Subject: Btrfs: switch to div64_u64 if with a u64 divisor This is fixing code pieces where we use div_u64 when passing a u64 divisor. Cc: David Sterba Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/volumes.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 32b2ffbb5717..5fe1ca8abc70 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -667,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div_u64(dev_replace->cursor_left, + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 58c5864bf709..4786eff53011 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2705,7 +2705,7 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div64_u64(*offset, map->stripe_len); stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ @@ -3108,7 +3108,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical = map->stripes[num].physical; offset = 0; - nstripes = div_u64(length, map->stripe_len); + nstripes = div64_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3b3be8eba776..2a4eada25743 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4816,7 +4816,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - stripe_size = div_u64(stripe_size, raid_stripe_len); + stripe_size = div64_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -5361,7 +5361,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, stripe_offset = offset - stripe_nr * stripe_len; stripe_nr_end = round_up(offset + length, map->stripe_len); - stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); + stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); stripe_cnt = stripe_nr_end - stripe_nr; stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + length); @@ -5801,7 +5801,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = div_u64(raid56_full_stripe_start, + stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ @@ -5998,7 +5998,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, continue; stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; -- cgit v1.2.3 From e884f4f06e89616ba32badcc7d87762e39a792e5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 4 Apr 2017 18:40:19 +0800 Subject: btrfs: use q which is already obtained from bdev_get_queue We have already assigned q from bdev_get_queue() so use it. And rearrange the code for better view. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2a4eada25743..6cad3233181c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1013,14 +1013,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - fs_devices->rotating = 1; - fs_devices->open_devices++; if (device->writeable && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -2422,7 +2421,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->free_chunk_space += device->total_bytes; spin_unlock(&fs_info->free_chunk_lock); - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!blk_queue_nonrot(q)) fs_info->fs_devices->rotating = 1; tmp = btrfs_super_total_bytes(fs_info->super_copy); -- cgit v1.2.3 From 82bafb38c2d6bf3b9ab91bde448c08b8154660c1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Feb 2017 15:10:33 +0800 Subject: btrfs: qgroup: Fix qgroup corruption caused by inode_cache mount option [BUG] The easist way to reproduce the bug is: ------ # mkfs.btrfs -f $dev -n 16K # mount $dev $mnt -o inode_cache # btrfs quota enable $mnt # btrfs quota rescan -w $mnt # btrfs qgroup show $mnt qgroupid rfer excl -------- ---- ---- 0/5 32.00KiB 32.00KiB ^^ Twice the correct value ------ And fstests/btrfs qgroup test group can easily detect them with inode_cache mount option. Although some of them are false alerts since old test cases are using fixed golden output. While new test cases will use "btrfs check" to detect qgroup mismatch. [CAUSE] Inode_cache mount option will make commit_fs_roots() to call btrfs_save_ino_cache() to update fs/subvol trees, and generate new delayed refs. However we call btrfs_qgroup_prepare_account_extents() too early, before commit_fs_roots(). This makes the "old_roots" for newly generated extents are always NULL. For freeing extent case, this makes both new_roots and old_roots to be empty, while correct old_roots should not be empty. This causing qgroup numbers not decreased correctly. [FIX] Modify the timing of calling btrfs_qgroup_prepare_account_extents() to just before btrfs_qgroup_account_extents(), and add needed delayed_refs handler. So qgroup can handle inode_map mount options correctly. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ee5844855c15..2168654c90a1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2128,13 +2128,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* Reocrd old roots for later qgroup accounting */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } - /* * make sure none of the code above managed to slip in a * delayed item @@ -2176,6 +2169,24 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ btrfs_free_log_root_tree(trans, fs_info); + /* + * commit_fs_roots() can call btrfs_save_ino_cache(), which generates + * new delayed refs. Must handle them or qgroup can be wrong. + */ + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); + goto scrub_continue; + } + + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); + goto scrub_continue; + } + /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. -- cgit v1.2.3 From 9986277e0e4ce10fd37bf853fe22ba429a967a45 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Apr 2017 11:57:15 +0300 Subject: Btrfs: handle only applicable errors returned by btrfs_get_extent btrfs_get_extent() never returns NULL pointers, so this code introduces a static checker warning. The btrfs_get_extent() is a bit complex, but trust me that it doesn't return NULLs and also if it did we would trigger the BUG_ON(!em) before the last return statement. Signed-off-by: Dan Carpenter [ updated subject ] Signed-off-by: David Sterba --- fs/btrfs/file.c | 16 ++++------------ fs/btrfs/inode.c | 25 +++++++++++-------------- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 520cb7230b2d..3cedbfd08a3a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2342,13 +2342,8 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) int ret = 0; em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); - return ret; - } + if (IS_ERR(em)) + return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->block_start == EXTENT_MAP_HOLE) { @@ -2835,11 +2830,8 @@ static long btrfs_fallocate(struct file *file, int mode, while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); + if (IS_ERR(em)) { + ret = PTR_ERR(em); break; } last_byte = min(extent_map_end(em), alloc_end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da71119b5b7c..31759d48e880 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6736,7 +6736,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, * * This also copies inline extents directly into the page. */ - struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, @@ -7045,19 +7044,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, em = btrfs_get_extent(inode, page, pg_offset, start, len, create); if (IS_ERR(em)) return em; - if (em) { - /* - * if our em maps to - * - a hole or - * - a pre-alloc extent, - * there might actually be delalloc bytes behind it. - */ - if (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return em; - else - hole_em = em; - } + /* + * If our em maps to: + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. + */ + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return em; + else + hole_em = em; /* check to see if we've wrapped (len == -1 or similar) */ end = start + len; -- cgit v1.2.3 From fa7aede2ab5f54352c7aec056930dd17b28f3a78 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Fri, 7 Apr 2017 17:57:05 -0700 Subject: btrfs: Use ktime_get_real_ts for root ctime btrfs_root_item maintains the ctime for root updates. This is not part of vfs_inode. Since current_time() uses struct inode* as an argument as Linus suggested, this cannot be used to update root times unless, we modify the signature to use inode. Since btrfs uses nanosecond time granularity, it can also use ktime_get_real_ts directly to obtain timestamp for the root. It is necessary to use the timespec time api here because the same btrfs_set_stack_timespec_*() apis are used for vfs inode times as well. These can be transitioned to using timespec64 when btrfs internally changes to use timespec64 as well. Signed-off-by: Deepa Dinamani Acked-by: David Sterba Reviewed-by: Arnd Bergmann Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a08224eab8b4..7d6bc308bf43 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -501,8 +501,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_root_item *item = &root->root_item; - struct timespec ct = current_fs_time(root->fs_info->sb); + struct timespec ct; + ktime_get_real_ts(&ct); spin_lock(&root->root_item_lock); btrfs_set_root_ctransid(item, trans->transid); btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); -- cgit v1.2.3 From 0966a7b1300f953b04b436aa82486d3d1b17c96d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 14 Apr 2017 08:35:54 +0800 Subject: btrfs: scrub: Introduce full stripe lock for RAID56 Unlike mirror based profiles, RAID5/6 recovery needs to read out the whole full stripe. And if we don't do proper protection, it can easily cause race condition. Introduce 2 new functions: lock_full_stripe() and unlock_full_stripe() for RAID5/6. Which store a rb_tree of mutexes for full stripes, so scrub callers can use them to lock a full stripe to avoid race. Signed-off-by: Qu Wenruo Reviewed-by: Liu Bo Reviewed-by: David Sterba [ minor comment adjustments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 17 ++++ fs/btrfs/extent-tree.c | 11 +++ fs/btrfs/scrub.c | 223 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 70631d773669..1e82516fe2d8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -539,6 +539,14 @@ struct btrfs_io_ctl { unsigned check_crcs:1; }; +/* + * Tree to record all locked full stripes of a RAID5/6 block group + */ +struct btrfs_full_stripe_locks_tree { + struct rb_root root; + struct mutex lock; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -649,6 +657,9 @@ struct btrfs_block_group_cache { * Protected by free_space_lock. */ int needs_free_space; + + /* Record locked full stripes for RAID5/6 block group */ + struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; /* delayed seq elem */ @@ -3653,6 +3664,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); +static inline void btrfs_init_full_stripe_locks_tree( + struct btrfs_full_stripe_locks_tree *locks_root) +{ + locks_root->root = RB_ROOT; + mutex_init(&locks_root->lock); +} /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e870e60e33bc..e390451c72e6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); + + /* + * If not empty, someone is still holding mutex of + * full_stripe_lock, which can only be released by caller. + * And it will definitely cause use-after-free when caller + * tries to release full stripe lock. + * + * No better way to resolve, but only to warn. + */ + WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); kfree(cache); } @@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); mutex_init(&cache->free_space_lock); + btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); return cache; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4786eff53011..34160d350c77 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -240,6 +240,13 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct full_stripe_lock { + struct rb_node node; + u64 logical; + u64 refs; + struct mutex mutex; +}; + static void scrub_pending_bio_inc(struct scrub_ctx *sctx); static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); @@ -348,6 +355,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) scrub_pause_off(fs_info); } +/* + * Insert new full stripe lock into full stripe locks tree + * + * Return pointer to existing or newly inserted full_stripe_lock structure if + * everything works well. + * Return ERR_PTR(-ENOMEM) if we failed to allocate memory + * + * NOTE: caller must hold full_stripe_locks_root->lock before calling this + * function + */ +static struct full_stripe_lock *insert_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct full_stripe_lock *entry; + struct full_stripe_lock *ret; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + p = &locks_root->root.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) { + p = &(*p)->rb_left; + } else if (fstripe_logical > entry->logical) { + p = &(*p)->rb_right; + } else { + entry->refs++; + return entry; + } + } + + /* Insert new lock */ + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + ret->logical = fstripe_logical; + ret->refs = 1; + mutex_init(&ret->mutex); + + rb_link_node(&ret->node, parent, p); + rb_insert_color(&ret->node, &locks_root->root); + return ret; +} + +/* + * Search for a full stripe lock of a block group + * + * Return pointer to existing full stripe lock if found + * Return NULL if not found + */ +static struct full_stripe_lock *search_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node *node; + struct full_stripe_lock *entry; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + node = locks_root->root.rb_node; + while (node) { + entry = rb_entry(node, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) + node = node->rb_left; + else if (fstripe_logical > entry->logical) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * Helper to get full stripe logical from a normal bytenr. + * + * Caller must ensure @cache is a RAID56 block group. + */ +static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, + u64 bytenr) +{ + u64 ret; + + /* + * Due to chunk item size limit, full stripe length should not be + * larger than U32_MAX. Just a sanity check here. + */ + WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); + + /* + * round_down() can only handle power of 2, while RAID56 full + * stripe length can be 64KiB * n, so we need to manually round down. + */ + ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * + cache->full_stripe_len + cache->key.objectid; + return ret; +} + +/* + * Lock a full stripe to avoid concurrency of recovery and read + * + * It's only used for profiles with parities (RAID5/6), for other profiles it + * does nothing. + * + * Return 0 if we locked full stripe covering @bytenr, with a mutex held. + * So caller must call unlock_full_stripe() at the same context. + * + * Return <0 if encounters error. + */ +static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool *locked_ret) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *existing; + u64 fstripe_start; + int ret = 0; + + *locked_ret = false; + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + + /* Profiles not based on parity don't need full stripe lock */ + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + locks_root = &bg_cache->full_stripe_locks_root; + + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + /* Now insert the full stripe lock */ + mutex_lock(&locks_root->lock); + existing = insert_full_stripe_lock(locks_root, fstripe_start); + mutex_unlock(&locks_root->lock); + if (IS_ERR(existing)) { + ret = PTR_ERR(existing); + goto out; + } + mutex_lock(&existing->mutex); + *locked_ret = true; +out: + btrfs_put_block_group(bg_cache); + return ret; +} + +/* + * Unlock a full stripe. + * + * NOTE: Caller must ensure it's the same context calling corresponding + * lock_full_stripe(). + * + * Return 0 if we unlock full stripe without problem. + * Return <0 for error + */ +static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool locked) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *fstripe_lock; + u64 fstripe_start; + bool freeit = false; + int ret = 0; + + /* If we didn't acquire full stripe lock, no need to continue */ + if (!locked) + return 0; + + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + + locks_root = &bg_cache->full_stripe_locks_root; + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + mutex_lock(&locks_root->lock); + fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); + /* Unpaired unlock_full_stripe() detected */ + if (!fstripe_lock) { + WARN_ON(1); + ret = -ENOENT; + mutex_unlock(&locks_root->lock); + goto out; + } + + if (fstripe_lock->refs == 0) { + WARN_ON(1); + btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", + fstripe_lock->logical); + } else { + fstripe_lock->refs--; + } + + if (fstripe_lock->refs == 0) { + rb_erase(&fstripe_lock->node, &locks_root->root); + freeit = true; + } + mutex_unlock(&locks_root->lock); + + mutex_unlock(&fstripe_lock->mutex); + if (freeit) + kfree(fstripe_lock); +out: + btrfs_put_block_group(bg_cache); + return ret; +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) -- cgit v1.2.3 From 28d70e237dac905cd8d1896af10216b7d2bced24 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 14 Apr 2017 08:35:55 +0800 Subject: btrfs: scrub: Fix RAID56 recovery race condition When scrubbing a RAID5 which has recoverable data corruption (only one data stripe is corrupted), sometimes scrub will report more csum errors than expected. Sometimes even unrecoverable error will be reported. The problem can be easily reproduced by the following steps: 1) Create a btrfs with RAID5 data profile with 3 devs 2) Mount it with nospace_cache or space_cache=v2 To avoid extra data space usage. 3) Create a 128K file and sync the fs, unmount it Now the 128K file lies at the beginning of the data chunk 4) Locate the physical bytenr of data chunk on dev3 Dev3 is the 1st data stripe. 5) Corrupt the first 64K of the data chunk stripe on dev3 6) Mount the fs and scrub it The correct csum error number should be 16 (assuming using x86_64). Larger csum error number can be reported in a 1/3 chance. And unrecoverable error can also be reported in a 1/10 chance. The root cause of the problem is RAID5/6 recover code has race condition, due to the fact that full scrub is initiated per device. While for other mirror based profiles, each mirror is independent with each other, so race won't cause any big problem. For example: Corrupted | Correct | Correct | | Scrub dev3 (D1) | Scrub dev2 (D2) | Scrub dev1(P) | ------------------------------------------------------------------------ Read out D1 |Read out D2 |Read full stripe | Check csum |Check csum |Check parity | Csum mismatch |Csum match, continue |Parity mismatch | handle_errored_block | |handle_errored_block | Read out full stripe | | Read out full stripe| D1 csum error(err++) | | D1 csum error(err++)| Recover D1 | | Recover D1 | So D1's csum error is accounted twice, just because handle_errored_block() doesn't have enough protection, and race can happen. On even worse case, for example D1's recovery code is re-writing D1/D2/P, and P's recovery code is just reading out full stripe, then we can cause unrecoverable error. This patch will use previously introduced lock_full_stripe() and unlock_full_stripe() to protect the whole scrub_handle_errored_block() function for RAID56 recovery. So no extra csum error nor unrecoverable error. Reported-by: Goffredo Baroncelli Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 34160d350c77..c7b45eb2403d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1117,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int mirror_index; int page_num; int success; + bool full_stripe_locked; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1142,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + /* + * For RAID5/6, race can happen for a different device scrub thread. + * For data corruption, Parity and Data threads will both try + * to recovery the data. + * Race can lead to doubly added csum error, or even unrecoverable + * error. + */ + ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); + if (ret < 0) { + spin_lock(&sctx->stat_lock); + if (ret == -ENOMEM) + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + return ret; + } + if (sctx->is_dev_replace && !is_metadata && !have_csum) { sblocks_for_recheck = NULL; goto nodatasum_case; @@ -1476,6 +1495,9 @@ out: kfree(sblocks_for_recheck); } + ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + if (ret < 0) + return ret; return 0; } -- cgit v1.2.3 From 13e88e1560d6014838e2dd9f8b9cf8ec9a8d86e6 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 6 Apr 2017 11:22:52 +0800 Subject: btrfs: delete unused member nobarriers The last consumer of nobarriers is removed by the commit [1] and sync won't fail with EOPNOTSUPP anymore. Thus, now when write cache is write through it just return success without actually transpiring such a request to the block device/lun. [1] commit b25de9d6da49b1a8760a89672283128aa8c78345 block: remove BIO_EOPNOTSUPP And, as the device/lun write cache state may change dynamically saving such as state won't help either. So deleting the member nobarriers. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 --- fs/btrfs/volumes.h | 1 - 2 files changed, 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9314fe494c9..e070e463ad8b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3521,9 +3521,6 @@ static int write_dev_flush(struct btrfs_device *device, int wait) struct bio *bio; int ret = 0; - if (device->nobarriers) - return 0; - if (wait) { bio = device->flush_bio; if (!bio) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index def39b5a8d9e..c7d0fbc915ca 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -123,7 +123,6 @@ struct btrfs_device { struct list_head resized_list; /* for sending down flush barriers */ - int nobarriers; struct bio *flush_bio; struct completion flush_wait; -- cgit v1.2.3 From c2a9c7ab475bc3aaf06521a39ac65bc48c8cad4f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 6 Apr 2017 11:22:53 +0800 Subject: btrfs: check if the device is flush capable The block layer call chain from submit_bio will check if the write cache is enabled for the given queue before submitting the flush. This will add a code to fail fast if its not. Signed-off-by: Anand Jain Reviewed-by: David Sterba [ updated changelog to reflect current code stat, blkdev_issue_flush is not used yet ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e070e463ad8b..ba7bd65693a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3518,9 +3518,13 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static int write_dev_flush(struct btrfs_device *device, int wait) { + struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio; int ret = 0; + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return 0; + if (wait) { bio = device->flush_bio; if (!bio) -- cgit v1.2.3 From 4dbd80fb9176f23c78cecd0a8285001cd2066425 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Mar 2017 10:25:51 +0800 Subject: btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csum error [BUG] When btrfs_reloc_clone_csum() reports error, it can underflow metadata and leads to kernel assertion on outstanding extents in run_delalloc_nocow() and cow_file_range(). BTRFS info (device vdb5): relocating block group 12582912 flags data BTRFS info (device vdb5): found 1 extents assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858 Currently, due to another bug blocking ordered extents, the bug is only reproducible under certain block group layout and using error injection. a) Create one data block group with one 4K extent in it. To avoid the bug that hangs btrfs due to ordered extent which never finishes b) Make btrfs_reloc_clone_csum() always fail c) Relocate that block group [CAUSE] run_delalloc_nocow() and cow_file_range() handles error from btrfs_reloc_clone_csum() wrongly: (The ascii chart shows a more generic case of this bug other than the bug mentioned above) |<------------------ delalloc range --------------------------->| | OE 1 | OE 2 | ... | OE n | |<----------- cleanup range --------------->| |<----------- ----------->| \/ btrfs_finish_ordered_io() range So error handler, which calls extent_clear_unlock_delalloc() with EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io() will both cover OE n, and free its metadata, causing metadata under flow. [Fix] The fix is to ensure after calling btrfs_add_ordered_extent(), we only call error handler after increasing the iteration offset, so that cleanup range won't cover any created ordered extent. |<------------------ delalloc range --------------------------->| | OE 1 | OE 2 | ... | OE n | |<----------- ----------->|<---------- cleanup range --------->| \/ btrfs_finish_ordered_io() range Signed-off-by: Qu Wenruo Reviewed-by: Filipe Manana Reviewed-by: Liu Bo --- fs/btrfs/inode.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55ed2c4829a8..844bb896f5ac 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -998,15 +998,24 @@ static noinline int cow_file_range(struct inode *inode, BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); + /* + * Only drop cache here, and process as normal. + * + * We must not allow extent_clear_unlock_delalloc() + * at out_unlock label to free meta of this ordered + * extent, as its meta should be freed by + * btrfs_finish_ordered_io(). + * + * So we must continue until @start is increased to + * skip current ordered extent. + */ if (ret) - goto out_drop_extent_cache; + btrfs_drop_extent_cache(BTRFS_I(inode), start, + start + ram_size - 1, 0); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (disk_num_bytes < cur_alloc_size) - break; - /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits @@ -1022,10 +1031,21 @@ static noinline int cow_file_range(struct inode *inode, delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, op); - disk_num_bytes -= cur_alloc_size; + if (disk_num_bytes < cur_alloc_size) + disk_num_bytes = 0; + else + disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; + + /* + * btrfs_reloc_clone_csums() error, since start is increased + * extent_clear_unlock_delalloc() at out_unlock label won't + * free metadata of current ordered extent, we're OK to exit. + */ + if (ret) + goto out_unlock; } out: return ret; @@ -1414,15 +1434,14 @@ out_check: BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + BTRFS_DATA_RELOC_TREE_OBJECTID) + /* + * Error handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler + * from freeing metadata of created ordered extent. + */ ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) { - if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); - goto error; - } - } extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, end, @@ -1434,6 +1453,14 @@ out_check: if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; + + /* + * btrfs_reloc_clone_csums() error, now we're OK to call error + * handler, as metadata for created ordered extent will only + * be freed by btrfs_finish_ordered_io(). + */ + if (ret) + goto error; if (cur_offset > end) break; } -- cgit v1.2.3 From 524272607e882d04e6d1a70d41fcbed819445ab9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Mar 2017 10:25:52 +0800 Subject: btrfs: Handle delalloc error correctly to avoid ordered extent hang [BUG] If run_delalloc_range() returns error and there is already some ordered extents created, btrfs will be hanged with the following backtrace: Call Trace: __schedule+0x2d4/0xae0 schedule+0x3d/0x90 btrfs_start_ordered_extent+0x160/0x200 [btrfs] ? wake_atomic_t_function+0x60/0x60 btrfs_run_ordered_extent_work+0x25/0x40 [btrfs] btrfs_scrubparity_helper+0x1c1/0x620 [btrfs] btrfs_flush_delalloc_helper+0xe/0x10 [btrfs] process_one_work+0x2af/0x720 ? process_one_work+0x22b/0x720 worker_thread+0x4b/0x4f0 kthread+0x10f/0x150 ? process_one_work+0x720/0x720 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x2e/0x40 [CAUSE] |<------------------ delalloc range --------------------------->| | OE 1 | OE 2 | ... | OE n | |<>| |<---------- cleanup range --------->| || \_=> First page handled by end_extent_writepage() in __extent_writepage() The problem is caused by error handler of run_delalloc_range(), which doesn't handle any created ordered extents, leaving them waiting on btrfs_finish_ordered_io() to finish. However after run_delalloc_range() returns error, __extent_writepage() won't submit bio, so btrfs_writepage_end_io_hook() won't be triggered except the first page, and btrfs_finish_ordered_io() won't be triggered for created ordered extents either. So OE 2~n will hang forever, and if OE 1 is larger than one page, it will also hang. [FIX] Introduce btrfs_cleanup_ordered_extents() function to cleanup created ordered extents and finish them manually. The function is based on existing btrfs_endio_direct_write_update_ordered() function, and modify it to act just like btrfs_writepage_endio_hook() but handles specified range other than one page. After fix, delalloc error will be handled like: |<------------------ delalloc range --------------------------->| | OE 1 | OE 2 | ... | OE n | |<>|<-------- ----------->|<------ old error handler --------->| || || || \_=> Cleaned up by cleanup_ordered_extents() \_=> First page handled by end_extent_writepage() in __extent_writepage() Signed-off-by: Qu Wenruo Reviewed-by: Filipe Manana Reviewed-by: Liu Bo Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 63 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 844bb896f5ac..b8b2a3cbdbe1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -115,6 +115,31 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, u64 ram_bytes, int compress_type, int type); +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate); + +/* + * Cleanup all submitted ordered extents in specified range to handle errors + * from the fill_dellaloc() callback. + * + * NOTE: caller must ensure that when an error happens, it can not call + * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING + * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata + * to be released, which we want to happen only when finishing the ordered + * extent (btrfs_finish_ordered_io()). Also note that the caller of the + * fill_delalloc() callback already does proper cleanup for the first page of + * the range, that is, it invokes the callback writepage_end_io_hook() for the + * range of the first page. + */ +static inline void btrfs_cleanup_ordered_extents(struct inode *inode, + const u64 offset, + const u64 bytes) +{ + return __endio_write_update_ordered(inode, offset + PAGE_SIZE, + bytes - PAGE_SIZE, false); +} + static int btrfs_dirty_inode(struct inode *inode); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -1536,6 +1561,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); } + if (ret) + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -8154,17 +8181,26 @@ static void btrfs_endio_direct_read(struct bio *bio) bio_put(bio); } -static void btrfs_endio_direct_write_update_ordered(struct inode *inode, - const u64 offset, - const u64 bytes, - const int uptodate) +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered = NULL; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; int ret; + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -8173,9 +8209,8 @@ again: if (!ret) goto out_test; - btrfs_init_work(&ordered->work, btrfs_endio_write_helper, - finish_ordered_fn, NULL, NULL); - btrfs_queue_work(fs_info->endio_write_workers, &ordered->work); + btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -8193,10 +8228,8 @@ static void btrfs_endio_direct_write(struct bio *bio) struct btrfs_dio_private *dip = bio->bi_private; struct bio *dio_bio = dip->dio_bio; - btrfs_endio_direct_write_update_ordered(dip->inode, - dip->logical_offset, - dip->bytes, - !bio->bi_error); + __endio_write_update_ordered(dip->inode, dip->logical_offset, + dip->bytes, !bio->bi_error); kfree(dip); @@ -8557,10 +8590,10 @@ free_ordered: io_bio = NULL; } else { if (write) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, file_offset, dio_bio->bi_iter.bi_size, - 0); + false); else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); @@ -8695,11 +8728,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ if (dio_data.unsubmitted_oe_range_start < dio_data.unsubmitted_oe_range_end) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, dio_data.unsubmitted_oe_range_start, dio_data.unsubmitted_oe_range_end - dio_data.unsubmitted_oe_range_start, - 0); + false); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); -- cgit v1.2.3 From a315e68f6e8b3006c29482dbfc4d928f098c449c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Mar 2017 23:04:20 +0000 Subject: Btrfs: fix invalid attempt to free reserved space on failure to cow range When attempting to COW a file range (we are starting writeback and doing COW), if we manage to reserve an extent for the range we will write into but fail after reserving it and before creating the respective ordered extent, we end up in an error path where we attempt to decrement the data space's bytes_may_use counter after we already did it while reserving the extent, leading to a warning/trace like the following: [ 847.621524] ------------[ cut here ]------------ [ 847.625441] WARNING: CPU: 5 PID: 4905 at fs/btrfs/extent-tree.c:4316 btrfs_free_reserved_data_space_noquota+0x60/0x9f [btrfs] [ 847.633704] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq i2c_piix4 ppdev psmouse tpm_tis serio_raw pcspkr parport_pc tpm_tis_core i2c_core sg [ 847.644616] CPU: 5 PID: 4905 Comm: xfs_io Not tainted 4.10.0-rc8-btrfs-next-37+ #2 [ 847.648601] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 [ 847.648601] Call Trace: [ 847.648601] dump_stack+0x67/0x90 [ 847.648601] __warn+0xc2/0xdd [ 847.648601] warn_slowpath_null+0x1d/0x1f [ 847.648601] btrfs_free_reserved_data_space_noquota+0x60/0x9f [btrfs] [ 847.648601] btrfs_clear_bit_hook+0x140/0x258 [btrfs] [ 847.648601] clear_state_bit+0x87/0x128 [btrfs] [ 847.648601] __clear_extent_bit+0x222/0x2b7 [btrfs] [ 847.648601] clear_extent_bit+0x17/0x19 [btrfs] [ 847.648601] extent_clear_unlock_delalloc+0x3b/0x6b [btrfs] [ 847.648601] cow_file_range.isra.39+0x387/0x39a [btrfs] [ 847.648601] run_delalloc_nocow+0x4d7/0x70e [btrfs] [ 847.648601] ? arch_local_irq_save+0x9/0xc [ 847.648601] run_delalloc_range+0xa7/0x2b5 [btrfs] [ 847.648601] writepage_delalloc.isra.31+0xb9/0x15c [btrfs] [ 847.648601] __extent_writepage+0x249/0x2e8 [btrfs] [ 847.648601] extent_write_cache_pages.constprop.33+0x28b/0x36c [btrfs] [ 847.648601] ? arch_local_irq_save+0x9/0xc [ 847.648601] ? mark_lock+0x24/0x201 [ 847.648601] extent_writepages+0x4b/0x5c [btrfs] [ 847.648601] ? btrfs_writepage_start_hook+0xed/0xed [btrfs] [ 847.648601] btrfs_writepages+0x28/0x2a [btrfs] [ 847.648601] do_writepages+0x23/0x2c [ 847.648601] __filemap_fdatawrite_range+0x5a/0x61 [ 847.648601] filemap_fdatawrite_range+0x13/0x15 [ 847.648601] btrfs_fdatawrite_range+0x20/0x46 [btrfs] [ 847.648601] start_ordered_ops+0x19/0x23 [btrfs] [ 847.648601] btrfs_sync_file+0x136/0x42c [btrfs] [ 847.648601] vfs_fsync_range+0x8c/0x9e [ 847.648601] vfs_fsync+0x1c/0x1e [ 847.648601] do_fsync+0x31/0x4a [ 847.648601] SyS_fsync+0x10/0x14 [ 847.648601] entry_SYSCALL_64_fastpath+0x18/0xad [ 847.648601] RIP: 0033:0x7f5b05200800 [ 847.648601] RSP: 002b:00007ffe204f71c8 EFLAGS: 00000246 ORIG_RAX: 000000000000004a [ 847.648601] RAX: ffffffffffffffda RBX: ffffffff8109637b RCX: 00007f5b05200800 [ 847.648601] RDX: 00000000008bd0a0 RSI: 00000000008bd2e0 RDI: 0000000000000003 [ 847.648601] RBP: ffffc90001d67f98 R08: 000000000000ffff R09: 000000000000001f [ 847.648601] R10: 00000000000001f6 R11: 0000000000000246 R12: 0000000000000046 [ 847.648601] R13: ffffc90001d67f78 R14: 00007f5b054be740 R15: 00007f5b054be740 [ 847.648601] ? trace_hardirqs_off_caller+0x3f/0xaa [ 847.685787] ---[ end trace 2a4a3e15382508e8 ]--- So fix this by not attempting to decrement the data space info's bytes_may_use counter if we already reserved the extent and an error happened before creating the ordered extent. We are already correctly freeing the reserved extent if an error happens, so there's no additional measure needed. Signed-off-by: Filipe Manana Reviewed-by: Liu Bo --- fs/btrfs/extent_io.h | 4 +++- fs/btrfs/inode.c | 59 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3e4fad4a909d..48a30d0e71fb 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -14,7 +14,7 @@ #define EXTENT_DEFRAG (1U << 6) #define EXTENT_BOUNDARY (1U << 9) #define EXTENT_NODATASUM (1U << 10) -#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_CLEAR_META_RESV (1U << 11) #define EXTENT_FIRST_DELALLOC (1U << 12) #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) @@ -22,6 +22,8 @@ #define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_CLEAR_DATA_RESV (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b8b2a3cbdbe1..bfe04afa6277 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -943,10 +943,13 @@ static noinline int cow_file_range(struct inode *inode, u64 num_bytes; unsigned long ram_size; u64 disk_num_bytes; - u64 cur_alloc_size; + u64 cur_alloc_size = 0; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; + unsigned clear_bits; + unsigned long page_ops; + bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { @@ -991,14 +994,14 @@ static noinline int cow_file_range(struct inode *inode, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { - unsigned long op; - cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; + cur_alloc_size = ins.offset; + extent_reserved = true; ram_size = ins.offset; em = create_io_em(inode, start, ins.offset, /* len */ @@ -1013,7 +1016,6 @@ static noinline int cow_file_range(struct inode *inode, goto out_reserve; free_extent_map(em); - cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) @@ -1048,14 +1050,14 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ - op = unlock ? PAGE_UNLOCK : 0; - op |= PAGE_SET_PRIVATE2; + page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops |= PAGE_SET_PRIVATE2; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, - op); + page_ops); if (disk_num_bytes < cur_alloc_size) disk_num_bytes = 0; else @@ -1063,6 +1065,7 @@ static noinline int cow_file_range(struct inode *inode, num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; + extent_reserved = false; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1081,12 +1084,35 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | + EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK; + /* + * If we reserved an extent for our delalloc range (or a subrange) and + * failed to create the respective ordered extent, then it means that + * when we reserved the extent we decremented the extent's size from + * the data space_info's bytes_may_use counter and incremented the + * space_info's bytes_reserved counter by the same amount. We must make + * sure extent_clear_unlock_delalloc() does not try to decrement again + * the data space_info's bytes_may_use counter, therefore we do not pass + * it the flag EXTENT_CLEAR_DATA_RESV. + */ + if (extent_reserved) { + extent_clear_unlock_delalloc(inode, start, + start + cur_alloc_size, + start + cur_alloc_size, + locked_page, + clear_bits, + page_ops); + start += cur_alloc_size; + if (start >= end) + goto out; + } extent_clear_unlock_delalloc(inode, start, end, delalloc_end, locked_page, - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DELALLOC | EXTENT_DEFRAG, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + clear_bits | EXTENT_CLEAR_DATA_RESV, + page_ops); goto out; } @@ -1776,7 +1802,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { spin_lock(&inode->lock); inode->outstanding_extents -= num_extents; spin_unlock(&inode->lock); @@ -1787,7 +1813,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, * don't need to call dellalloc_release_metadata if there is an * error. */ - if (*bits & EXTENT_DO_ACCOUNTING && + if (*bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); @@ -1795,10 +1821,9 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE) - && (*bits & (EXTENT_DO_ACCOUNTING | - EXTENT_CLEAR_DATA_RESV))) + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + do_list && !(state->state & EXTENT_NORESERVE) && + (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota( &inode->vfs_inode, state->start, len); -- cgit v1.2.3 From 1c81ba237bcecad9bc885a1ddcf02d725ea38482 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 8 Mar 2017 16:43:49 +0000 Subject: Btrfs: fix incorrect space accounting after failure to insert inline extent When using compression, if we fail to insert an inline extent we incorrectly end up attempting to free the reserved data space twice, once through extent_clear_unlock_delalloc(), because we pass it the flag EXTENT_DO_ACCOUNTING, and once through a direct call to btrfs_free_reserved_data_space_noquota(). This results in a trace like the following: [ 834.576240] ------------[ cut here ]------------ [ 834.576825] WARNING: CPU: 2 PID: 486 at fs/btrfs/extent-tree.c:4316 btrfs_free_reserved_data_space_noquota+0x60/0x9f [btrfs] [ 834.579501] Modules linked in: btrfs crc32c_generic xor raid6_pq ppdev i2c_piix4 acpi_cpufreq psmouse tpm_tis parport_pc pcspkr serio_raw tpm_tis_core sg parport evdev i2c_core tpm button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix virtio_pci libata virtio_ring virtio scsi_mod e1000 floppy [last unloaded: btrfs] [ 834.592116] CPU: 2 PID: 486 Comm: kworker/u32:4 Not tainted 4.10.0-rc8-btrfs-next-37+ #2 [ 834.593316] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 [ 834.595273] Workqueue: btrfs-delalloc btrfs_delalloc_helper [btrfs] [ 834.596103] Call Trace: [ 834.596103] dump_stack+0x67/0x90 [ 834.596103] __warn+0xc2/0xdd [ 834.596103] warn_slowpath_null+0x1d/0x1f [ 834.596103] btrfs_free_reserved_data_space_noquota+0x60/0x9f [btrfs] [ 834.596103] compress_file_range.constprop.42+0x2fa/0x3fc [btrfs] [ 834.596103] ? submit_compressed_extents+0x3a7/0x3a7 [btrfs] [ 834.596103] async_cow_start+0x32/0x4d [btrfs] [ 834.596103] btrfs_scrubparity_helper+0x187/0x3e7 [btrfs] [ 834.596103] btrfs_delalloc_helper+0xe/0x10 [btrfs] [ 834.596103] process_one_work+0x273/0x4e4 [ 834.596103] worker_thread+0x1eb/0x2ca [ 834.596103] ? rescuer_thread+0x2b6/0x2b6 [ 834.596103] kthread+0x100/0x108 [ 834.596103] ? __list_del_entry+0x22/0x22 [ 834.596103] ret_from_fork+0x2e/0x40 [ 834.611656] ---[ end trace 719902fe6bdef08f ]--- So fix this by not calling directly btrfs_free_reserved_data_space_noquota() if an error happened. Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bfe04afa6277..707fccc5279b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -590,8 +590,10 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); + if (ret == 0) + btrfs_free_reserved_data_space_noquota(inode, + start, + end - start + 1); goto free_pages_out; } } -- cgit v1.2.3 From be2d253cc98244765323a7c94cc1ac5cd5a17072 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 3 Apr 2017 15:57:17 +0100 Subject: Btrfs: fix extent map leak during fallocate error path If the call to btrfs_qgroup_reserve_data() failed, we were leaking an extent map structure. The failure can happen either due to an -ENOMEM condition or, when quotas are enabled, due to -EDQUOT for example. Signed-off-by: Filipe Manana Reviewed-by: David Sterba --- fs/btrfs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 48dfb8e4baf2..56304c400db5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2856,8 +2856,10 @@ static long btrfs_fallocate(struct file *file, int mode, } ret = btrfs_qgroup_reserve_data(inode, cur_offset, last_byte - cur_offset); - if (ret < 0) + if (ret < 0) { + free_extent_map(em); break; + } } else { /* * Do not need to reserve unwritten extent for this -- cgit v1.2.3 From e1cbfd7bf6dabdac561c75d08357571f44040a45 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 4 Apr 2017 20:31:00 +0100 Subject: Btrfs: send, fix file hole not being preserved due to inline extent Normally we don't have inline extents followed by regular extents, but there's currently at least one harmless case where this happens. For example, when the page size is 4Kb and compression is enabled: $ mkfs.btrfs -f /dev/sdb $ mount -o compress /dev/sdb /mnt $ xfs_io -f -c "pwrite -S 0xaa 0 4K" -c "fsync" /mnt/foobar $ xfs_io -c "pwrite -S 0xbb 8K 4K" -c "fsync" /mnt/foobar In this case we get a compressed inline extent, representing 4Kb of data, followed by a hole extent and then a regular data extent. The inline extent was not expanded/converted to a regular extent exactly because it represents 4Kb of data. This does not cause any apparent problem (such as the issue solved by commit e1699d2d7bf6 ("btrfs: add missing memset while reading compressed inline extents")) except trigger an unexpected case in the incremental send code path that makes us issue an operation to write a hole when it's not needed, resulting in more writes at the receiver and wasting space at the receiver. So teach the incremental send code to deal with this particular case. The issue can be currently triggered by running fstests btrfs/137 with compression enabled (MOUNT_OPTIONS="-o compress" ./check btrfs/137). Signed-off-by: Filipe Manana Reviewed-by: Liu Bo --- fs/btrfs/send.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a60d5bfb8a49..5b40d617bb03 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5184,13 +5184,19 @@ static int is_extent_unchanged(struct send_ctx *sctx, while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); - if (right_type != BTRFS_FILE_EXTENT_REG) { + if (right_type != BTRFS_FILE_EXTENT_REG && + right_type != BTRFS_FILE_EXTENT_INLINE) { ret = 0; goto out; } right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - right_len = btrfs_file_extent_num_bytes(eb, ei); + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + right_len = btrfs_file_extent_inline_len(eb, slot, ei); + right_len = PAGE_ALIGN(right_len); + } else { + right_len = btrfs_file_extent_num_bytes(eb, ei); + } right_offset = btrfs_file_extent_offset(eb, ei); right_gen = btrfs_file_extent_generation(eb, ei); @@ -5204,6 +5210,19 @@ static int is_extent_unchanged(struct send_ctx *sctx, goto out; } + /* + * We just wanted to see if when we have an inline extent, what + * follows it is a regular extent (wanted to check the above + * condition for inline extents too). This should normally not + * happen but it's possible for example when we have an inline + * compressed extent representing data with a size matching + * the page size (currently the same as sector size). + */ + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + ret = 0; + goto out; + } + left_offset_fixed = left_offset; if (key.offset < ekey->offset) { /* Fix the right offset for 2a and 7. */ -- cgit v1.2.3 From a7e3b975a0f9296162b72ac6ab7fad9631a07630 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 3 Apr 2017 10:45:46 +0100 Subject: Btrfs: fix reported number of inode blocks Currently when there are buffered writes that were not yet flushed and they fall within allocated ranges of the file (that is, not in holes or beyond eof assuming there are no prealloc extents beyond eof), btrfs simply reports an incorrect number of used blocks through the stat(2) system call (or any of its variants), regardless of mount options or inode flags (compress, compress-force, nodatacow). This is because the number of blocks used that is reported is based on the current number of bytes in the vfs inode plus the number of dealloc bytes in the btrfs inode. The later covers bytes that both fall within allocated regions of the file and holes. Example scenarios where the number of reported blocks is wrong while the buffered writes are not flushed: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt/sdc $ xfs_io -f -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo1 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (259.336 MiB/sec and 66390.0415 ops/sec) $ sync $ xfs_io -c "pwrite -S 0xbb 0 64K" /mnt/sdc/foo1 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (192.308 MiB/sec and 49230.7692 ops/sec) # The following should have reported 64K... $ du -h /mnt/sdc/foo1 128K /mnt/sdc/foo1 $ sync # After flushing the buffered write, it now reports the correct value. $ du -h /mnt/sdc/foo1 64K /mnt/sdc/foo1 $ xfs_io -f -c "falloc -k 0 128K" -c "pwrite -S 0xaa 0 64K" /mnt/sdc/foo2 wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0000 sec (520.833 MiB/sec and 133333.3333 ops/sec) $ sync $ xfs_io -c "pwrite -S 0xbb 64K 64K" /mnt/sdc/foo2 wrote 65536/65536 bytes at offset 65536 64 KiB, 16 ops; 0.0000 sec (260.417 MiB/sec and 66666.6667 ops/sec) # The following should have reported 128K... $ du -h /mnt/sdc/foo2 192K /mnt/sdc/foo2 $ sync # After flushing the buffered write, it now reports the correct value. $ du -h /mnt/sdc/foo2 128K /mnt/sdc/foo2 So the number of used file blocks is simply incorrect, unlike in other filesystems such as ext4 and xfs for example, but only while the buffered writes are not flushed. Fix this by tracking the number of delalloc bytes that fall within holes and beyond eof of a file, and use instead this new counter when reporting the number of used blocks for an inode. Another different problem that exists is that the delalloc bytes counter is reset when writeback starts (by clearing the EXTENT_DEALLOC flag from the respective range in the inode's iotree) and the vfs inode's bytes counter is only incremented when writeback finishes (through insert_reserved_file_extent()). Therefore while writeback is ongoing we simply report a wrong number of blocks used by an inode if the write operation covers a range previously unallocated. While this change does not fix this problem, it does minimizes it a lot by shortening that time window, as the new dealloc bytes counter (new_delalloc_bytes) is only decremented when writeback finishes right before updating the vfs inode's bytes counter. Fully fixing this second problem is not trivial and will be addressed later by a different patch. Signed-off-by: Filipe Manana --- fs/btrfs/btrfs_inode.h | 7 ++++++ fs/btrfs/extent_io.h | 1 + fs/btrfs/file.c | 62 ++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/inode.c | 67 ++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 119 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0c6baaba0651..b8622e4d1744 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -124,6 +124,13 @@ struct btrfs_inode { */ u64 delalloc_bytes; + /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes. + */ + u64 new_delalloc_bytes; + /* * total number of bytes pending defrag, used by stat to check whether * it needs COW. diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 48a30d0e71fb..d5ff51b973a4 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -21,6 +21,7 @@ #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_CLEAR_DATA_RESV (1U << 17) +#define EXTENT_DELALLOC_NEW (1U << 18) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 56304c400db5..f7d022bc7998 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1404,6 +1404,47 @@ fail: } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. @@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size) { + if (start_pos < inode->vfs_inode.i_size || + (inode->flags & BTRFS_INODE_PREALLOC)) { struct btrfs_ordered_extent *ordered; + unsigned int clear_bits; + lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - + ret = btrfs_find_new_delalloc_bytes(inode, start_pos, + last_pos - start_pos + 1, + cached_state); + clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; + if (ret) + clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; clear_extent_bit(&inode->io_tree, start_pos, - last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state, GFP_NOFS); + last_pos, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, + 0, cached_state, GFP_NOFS); + if (ret) + return ret; *lockstart = start_pos; *lockend = last_pos; ret = 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 707fccc5279b..58e5db8ebd5b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -572,7 +572,7 @@ cont: } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; unsigned long page_error_op; clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; @@ -879,6 +879,7 @@ out_free: async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | @@ -974,6 +975,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); @@ -1086,8 +1088,8 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | - EXTENT_CLEAR_META_RESV; + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK; /* @@ -1775,6 +1777,14 @@ static void btrfs_set_bit_hook(struct inode *inode, btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } + + if (!(state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - + state->start; + spin_unlock(&BTRFS_I(inode)->lock); + } } /* @@ -1840,6 +1850,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, btrfs_del_delalloc_inode(root, inode); spin_unlock(&inode->lock); } + + if ((state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&inode->lock); + ASSERT(inode->new_delalloc_bytes >= len); + inode->new_delalloc_bytes -= len; + spin_unlock(&inode->lock); + } } /* @@ -2872,6 +2890,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) u64 logical_len = ordered_extent->len; bool nolock; bool truncated = false; + bool range_locked = false; + bool clear_new_delalloc_bytes = false; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + clear_new_delalloc_bytes = true; nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); @@ -2920,6 +2945,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + range_locked = true; lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, &cached_state); @@ -2945,7 +2971,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; - goto out_unlock; + goto out; } trans->block_rsv = &fs_info->delalloc_block_rsv; @@ -2977,7 +3003,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } add_pending_csums(trans, inode, &ordered_extent->list); @@ -2986,14 +3012,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } ret = 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); out: + if (range_locked || clear_new_delalloc_bytes) { + unsigned int clear_bits = 0; + + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, + clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, + 0, &cached_state, GFP_NOFS); + } + if (root != fs_info->tree_root) btrfs_delalloc_release_metadata(BTRFS_I(inode), ordered_extent->len); @@ -8906,6 +8944,7 @@ again: if (!inode_evicting) clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); @@ -8963,8 +9002,8 @@ again: if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); @@ -9335,6 +9374,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; @@ -9400,6 +9440,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); WARN_ON(BTRFS_I(inode)->defrag_bytes); @@ -9523,7 +9564,7 @@ static int btrfs_getattr(struct vfsmount *mnt, stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); - delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; -- cgit v1.2.3 From 9bcaaea7418d09691f1ffab5c49aacafe3eef9d0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 4 May 2017 16:08:08 -0700 Subject: btrfs: fix the gfp_mask for the reada_zones radix tree Commits cc8385b59e17 and 7ef70b4d9987a7 added preallocation for the reada radix trees and also switched them over to GFP_KERNEL for the default gfp mask. Since we're doing radix tree insertions under spinlocks, we need to make sure the mask doesn't allow sleeping. This fix keeps the radix preallocation but switches back to the original gfp_mask. Reported-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/volumes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ba7bd65693a3..683194242deb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2693,7 +2693,7 @@ int open_ctree(struct super_block *sb, fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_KERNEL); + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6cad3233181c..017b67daa3bb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -252,7 +252,7 @@ static struct btrfs_device *__alloc_device(void) atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - INIT_RADIX_TREE(&dev->reada_zones, GFP_KERNEL); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); return dev; -- cgit v1.2.3