summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-11-11 16:54:38 -0600
committerLinus Torvalds <torvalds@linux-foundation.org>2018-11-11 16:54:38 -0600
commit63a42e1a5cb3d01eef2f370c11d8733a32f12f86 (patch)
treee28d678bc58570c67242b770a95e09b433d3ecee /fs/btrfs
parentc140f8b072d16595c83d4d16a05693e72d9b1973 (diff)
parentd6fd0ae25c6495674dc5a41a8d16bc8e0073276d (diff)
downloadlinux-63a42e1a5cb3d01eef2f370c11d8733a32f12f86.tar.gz
linux-63a42e1a5cb3d01eef2f370c11d8733a32f12f86.tar.bz2
linux-63a42e1a5cb3d01eef2f370c11d8733a32f12f86.zip
Merge tag 'for-4.20-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba: "Several fixes to recent release (4.19, fixes tagged for stable) and other fixes" * tag 'for-4.20-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: Btrfs: fix missing delayed iputs on unmount Btrfs: fix data corruption due to cloning of eof block Btrfs: fix infinite loop on inode eviction after deduplication of eof block Btrfs: fix deadlock on tree root leaf when finding free extent btrfs: avoid link error with CONFIG_NO_AUTO_INLINE btrfs: tree-checker: Fix misleading group system information Btrfs: fix missing data checksums after a ranged fsync (msync) btrfs: fix pinned underflow after transaction aborted Btrfs: fix cur_offset in the error case for nocow
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/ctree.h3
-rw-r--r--fs/btrfs/disk-io.c63
-rw-r--r--fs/btrfs/free-space-cache.c22
-rw-r--r--fs/btrfs/inode.c37
-rw-r--r--fs/btrfs/ioctl.c14
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/btrfs/tree-log.c17
8 files changed, 107 insertions, 57 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80953528572d..68f322f600a0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3163,6 +3163,9 @@ void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *new,
+ struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ab41da91d1..3f0b6d1936e8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1664,9 +1664,8 @@ static int cleaner_kthread(void *arg)
struct btrfs_root *root = arg;
struct btrfs_fs_info *fs_info = root->fs_info;
int again;
- struct btrfs_trans_handle *trans;
- do {
+ while (1) {
again = 0;
/* Make the cleaner go to sleep early. */
@@ -1715,42 +1714,16 @@ static int cleaner_kthread(void *arg)
*/
btrfs_delete_unused_bgs(fs_info);
sleep:
+ if (kthread_should_park())
+ kthread_parkme();
+ if (kthread_should_stop())
+ return 0;
if (!again) {
set_current_state(TASK_INTERRUPTIBLE);
- if (!kthread_should_stop())
- schedule();
+ schedule();
__set_current_state(TASK_RUNNING);
}
- } while (!kthread_should_stop());
-
- /*
- * Transaction kthread is stopped before us and wakes us up.
- * However we might have started a new transaction and COWed some
- * tree blocks when deleting unused block groups for example. So
- * make sure we commit the transaction we started to have a clean
- * shutdown when evicting the btree inode - if it has dirty pages
- * when we do the final iput() on it, eviction will trigger a
- * writeback for it which will fail with null pointer dereferences
- * since work queues and other resources were already released and
- * destroyed by the time the iput/eviction/writeback is made.
- */
- trans = btrfs_attach_transaction(root);
- if (IS_ERR(trans)) {
- if (PTR_ERR(trans) != -ENOENT)
- btrfs_err(fs_info,
- "cleaner transaction attach returned %ld",
- PTR_ERR(trans));
- } else {
- int ret;
-
- ret = btrfs_commit_transaction(trans);
- if (ret)
- btrfs_err(fs_info,
- "cleaner open transaction commit returned %d",
- ret);
}
-
- return 0;
}
static int transaction_kthread(void *arg)
@@ -3931,6 +3904,13 @@ void close_ctree(struct btrfs_fs_info *fs_info)
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+ /*
+ * We don't want the cleaner to start new transactions, add more delayed
+ * iputs, etc. while we're closing. We can't use kthread_stop() yet
+ * because that frees the task_struct, and the transaction kthread might
+ * still try to wake up the cleaner.
+ */
+ kthread_park(fs_info->cleaner_kthread);
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -3958,9 +3938,8 @@ void close_ctree(struct btrfs_fs_info *fs_info)
if (!sb_rdonly(fs_info->sb)) {
/*
- * If the cleaner thread is stopped and there are
- * block groups queued for removal, the deletion will be
- * skipped when we quit the cleaner thread.
+ * The cleaner kthread is stopped, so do one final pass over
+ * unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
@@ -4359,13 +4338,23 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
unpin = pinned_extents;
again:
while (1) {
+ /*
+ * The btrfs_finish_extent_commit() may get the same range as
+ * ours between find_first_extent_bit and clear_extent_dirty.
+ * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
+ * the same extent range.
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
- if (ret)
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
+ }
clear_extent_dirty(unpin, start, end);
btrfs_error_unpin_extent_range(fs_info, start, end);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4ba0aedc878b..74aa552f4793 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -75,7 +75,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+ inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
+ btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
return inode;
@@ -838,6 +839,25 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
path->search_commit_root = 1;
path->skip_locking = 1;
+ /*
+ * We must pass a path with search_commit_root set to btrfs_iget in
+ * order to avoid a deadlock when allocating extents for the tree root.
+ *
+ * When we are COWing an extent buffer from the tree root, when looking
+ * for a free extent, at extent-tree.c:find_free_extent(), we can find
+ * block group without its free space cache loaded. When we find one
+ * we must load its space cache which requires reading its free space
+ * cache's inode item from the root tree. If this inode item is located
+ * in the same leaf that we started COWing before, then we end up in
+ * deadlock on the extent buffer (trying to read lock it when we
+ * previously write locked it).
+ *
+ * It's safe to read the inode item using the commit root because
+ * block groups, once loaded, stay in memory forever (until they are
+ * removed) as well as their space caches once loaded. New block groups
+ * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
+ * we will never try to read their inode item while the fs is mounted.
+ */
inode = lookup_free_space_inode(fs_info, block_group, path);
if (IS_ERR(inode)) {
btrfs_free_path(path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3df5b52278c..9ea4c6f0352f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1531,12 +1531,11 @@ out_check:
}
btrfs_release_path(path);
- if (cur_offset <= end && cow_start == (u64)-1) {
+ if (cur_offset <= end && cow_start == (u64)-1)
cow_start = cur_offset;
- cur_offset = end;
- }
if (cow_start != (u64)-1) {
+ cur_offset = end;
ret = cow_file_range(inode, locked_page, cow_start, end, end,
page_started, nr_written, 1, NULL);
if (ret)
@@ -3570,10 +3569,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
/*
* read an inode from the btree into the in-memory inode
*/
-static int btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode,
+ struct btrfs_path *in_path)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_path *path;
+ struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3589,15 +3589,18 @@ static int btrfs_read_locked_inode(struct inode *inode)
if (!ret)
filled = true;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
- btrfs_free_path(path);
+ if (path != in_path)
+ btrfs_free_path(path);
return ret;
}
@@ -3722,7 +3725,8 @@ cache_acl:
btrfs_ino(BTRFS_I(inode)),
root->root_key.objectid, ret);
}
- btrfs_free_path(path);
+ if (path != in_path)
+ btrfs_free_path(path);
if (!maybe_acls)
cache_no_acl(inode);
@@ -5644,8 +5648,9 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
/* Get an inode object given its location and corresponding root.
* Returns in *is_new if the inode was read from disk
*/
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new)
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *new,
+ struct btrfs_path *path)
{
struct inode *inode;
@@ -5656,7 +5661,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
if (inode->i_state & I_NEW) {
int ret;
- ret = btrfs_read_locked_inode(inode);
+ ret = btrfs_read_locked_inode(inode, path);
if (!ret) {
inode_tree_add(inode);
unlock_new_inode(inode);
@@ -5678,6 +5683,12 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
return inode;
}
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *new)
+{
+ return btrfs_iget_path(s, location, root, new, NULL);
+}
+
static struct inode *new_simple_dir(struct super_block *s,
struct btrfs_key *key,
struct btrfs_root *root)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3ca6943827ef..802a628e9f7d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3488,6 +3488,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
len = round_down(i_size_read(src), sz) - loff;
+ if (len == 0)
+ return 0;
olen = len;
}
}
@@ -4257,9 +4259,17 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
goto out_unlock;
if (len == 0)
olen = len = src->i_size - off;
- /* if we extend to eof, continue to block boundary */
- if (off + len == src->i_size)
+ /*
+ * If we extend to eof, continue to block boundary if and only if the
+ * destination end offset matches the destination file's size, otherwise
+ * we would be corrupting data by placing the eof block into the middle
+ * of a file.
+ */
+ if (off + len == src->i_size) {
+ if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
+ goto out_unlock;
len = ALIGN(src->i_size, bs) - off;
+ }
if (len == 0) {
ret = 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b362b45dd757..cbc9d0d2c12d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1916,7 +1916,7 @@ restore:
}
/* Used to sort the devices by max_avail(descending sort) */
-static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
const void *dev_info2)
{
if (((struct btrfs_device_info *)dev_info1)->max_avail >
@@ -1945,8 +1945,8 @@ static inline void btrfs_descending_sort_devices(
* The helper to calc the free space on the devices that can be used to store
* file data.
*/
-static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
- u64 *free_bytes)
+static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
+ u64 *free_bytes)
{
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index cab0b1f1f741..efcf89a8ba44 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
type != (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA)) {
block_group_err(fs_info, leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llu or 0x%llx",
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_SYSTEM,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e07f3376b7df..a5ce99a6c936 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4396,6 +4396,23 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
logged_end = end;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+ /*
+ * Skip extents outside our logging range. It's important to do
+ * it for correctness because if we don't ignore them, we may
+ * log them before their ordered extent completes, and therefore
+ * we could log them without logging their respective checksums
+ * (the checksum items are added to the csum tree at the very
+ * end of btrfs_finish_ordered_io()). Also leave such extents
+ * outside of our range in the list, since we may have another
+ * ranged fsync in the near future that needs them. If an extent
+ * outside our range corresponds to a hole, log it to avoid
+ * leaving gaps between extents (fsck will complain when we are
+ * not using the NO_HOLES feature).
+ */
+ if ((em->start > end || em->start + em->len <= start) &&
+ em->block_start != EXTENT_MAP_HOLE)
+ continue;
+
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive