summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c85
-rw-r--r--fs/btrfs/bio.c2
-rw-r--r--fs/btrfs/block-group.c86
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/disk-io.c14
-rw-r--r--fs/btrfs/extent_map.c7
-rw-r--r--fs/btrfs/free-space-cache.c8
-rw-r--r--fs/btrfs/fs.h7
-rw-r--r--fs/btrfs/inode.c7
-rw-r--r--fs/btrfs/ioctl.c3
-rw-r--r--fs/btrfs/qgroup.c11
-rw-r--r--fs/btrfs/space-info.c42
-rw-r--r--fs/btrfs/space-info.h2
-rw-r--r--fs/btrfs/super.c4
-rw-r--r--fs/btrfs/sysfs.c42
-rw-r--r--fs/btrfs/transaction.c15
-rw-r--r--fs/btrfs/volumes.c23
-rw-r--r--fs/btrfs/zoned.c45
18 files changed, 275 insertions, 130 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 90e40d5ceccd..e54f0884802a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1921,8 +1921,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
level = -1;
ULIST_ITER_INIT(&uiter);
while (1) {
- bool is_shared;
- bool cached;
+ const unsigned long prev_ref_count = ctx->refs.nnodes;
walk_ctx.bytenr = bytenr;
ret = find_parent_nodes(&walk_ctx, &shared);
@@ -1940,21 +1939,36 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
ret = 0;
/*
- * If our data extent was not directly shared (without multiple
- * reference items), than it might have a single reference item
- * with a count > 1 for the same offset, which means there are 2
- * (or more) file extent items that point to the data extent -
- * this happens when a file extent item needs to be split and
- * then one item gets moved to another leaf due to a b+tree leaf
- * split when inserting some item. In this case the file extent
- * items may be located in different leaves and therefore some
- * of the leaves may be referenced through shared subtrees while
- * others are not. Since our extent buffer cache only works for
- * a single path (by far the most common case and simpler to
- * deal with), we can not use it if we have multiple leaves
- * (which implies multiple paths).
+ * More than one extent buffer (bytenr) may have been added to
+ * the ctx->refs ulist, in which case we have to check multiple
+ * tree paths in case the first one is not shared, so we can not
+ * use the path cache which is made for a single path. Multiple
+ * extent buffers at the current level happen when:
+ *
+ * 1) level -1, the data extent: If our data extent was not
+ * directly shared (without multiple reference items), then
+ * it might have a single reference item with a count > 1 for
+ * the same offset, which means there are 2 (or more) file
+ * extent items that point to the data extent - this happens
+ * when a file extent item needs to be split and then one
+ * item gets moved to another leaf due to a b+tree leaf split
+ * when inserting some item. In this case the file extent
+ * items may be located in different leaves and therefore
+ * some of the leaves may be referenced through shared
+ * subtrees while others are not. Since our extent buffer
+ * cache only works for a single path (by far the most common
+ * case and simpler to deal with), we can not use it if we
+ * have multiple leaves (which implies multiple paths).
+ *
+ * 2) level >= 0, a tree node/leaf: We can have a mix of direct
+ * and indirect references on a b+tree node/leaf, so we have
+ * to check multiple paths, and the extent buffer (the
+ * current bytenr) may be shared or not. One example is
+ * during relocation as we may get a shared tree block ref
+ * (direct ref) and a non-shared tree block ref (indirect
+ * ref) for the same node/leaf.
*/
- if (level == -1 && ctx->refs.nnodes > 1)
+ if ((ctx->refs.nnodes - prev_ref_count) > 1)
ctx->use_path_cache = false;
if (level >= 0)
@@ -1964,12 +1978,17 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
if (!node)
break;
bytenr = node->val;
- level++;
- cached = lookup_backref_shared_cache(ctx, root, bytenr, level,
- &is_shared);
- if (cached) {
- ret = (is_shared ? 1 : 0);
- break;
+ if (ctx->use_path_cache) {
+ bool is_shared;
+ bool cached;
+
+ level++;
+ cached = lookup_backref_shared_cache(ctx, root, bytenr,
+ level, &is_shared);
+ if (cached) {
+ ret = (is_shared ? 1 : 0);
+ break;
+ }
}
shared.share_count = 0;
shared.have_delayed_delete_refs = false;
@@ -1977,6 +1996,28 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
}
/*
+ * If the path cache is disabled, then it means at some tree level we
+ * got multiple parents due to a mix of direct and indirect backrefs or
+ * multiple leaves with file extent items pointing to the same data
+ * extent. We have to invalidate the cache and cache only the sharedness
+ * result for the levels where we got only one node/reference.
+ */
+ if (!ctx->use_path_cache) {
+ int i = 0;
+
+ level--;
+ if (ret >= 0 && level >= 0) {
+ bytenr = ctx->path_cache_entries[level].bytenr;
+ ctx->use_path_cache = true;
+ store_backref_shared_cache(ctx, root, bytenr, level, ret);
+ i = level + 1;
+ }
+
+ for ( ; i < BTRFS_MAX_LEVEL; i++)
+ ctx->path_cache_entries[i].bytenr = 0;
+ }
+
+ /*
* Cache the sharedness result for the data extent if we know our inode
* has more than 1 file extent item that refers to the data extent.
*/
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index d8b90f95b157..726592868e9c 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -287,7 +287,7 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
- if (!(bio->bi_opf & REQ_RAHEAD))
+ else if (!(bio->bi_opf & REQ_RAHEAD))
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5b10401d803b..5fc670c27f86 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -558,14 +558,15 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
struct btrfs_block_group *block_group,
int index, int max_index,
- struct btrfs_key *key)
+ struct btrfs_key *found_key)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root;
- int ret = 0;
u64 search_offset;
u64 search_end = block_group->start + block_group->length;
struct btrfs_path *path;
+ struct btrfs_key search_key;
+ int ret = 0;
ASSERT(index >= 0);
ASSERT(index <= max_index);
@@ -585,37 +586,24 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
path->reada = READA_FORWARD;
search_offset = index * div_u64(block_group->length, max_index);
- key->objectid = block_group->start + search_offset;
- key->type = BTRFS_EXTENT_ITEM_KEY;
- key->offset = 0;
+ search_key.objectid = block_group->start + search_offset;
+ search_key.type = BTRFS_EXTENT_ITEM_KEY;
+ search_key.offset = 0;
- while (1) {
- ret = btrfs_search_forward(extent_root, key, path, 0);
- if (ret != 0)
- goto out;
+ btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
/* Success; sampled an extent item in the block group */
- if (key->type == BTRFS_EXTENT_ITEM_KEY &&
- key->objectid >= block_group->start &&
- key->objectid + key->offset <= search_end)
- goto out;
+ if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
+ found_key->objectid >= block_group->start &&
+ found_key->objectid + found_key->offset <= search_end)
+ break;
/* We can't possibly find a valid extent item anymore */
- if (key->objectid >= search_end) {
+ if (found_key->objectid >= search_end) {
ret = 1;
break;
}
- if (key->type < BTRFS_EXTENT_ITEM_KEY)
- key->type = BTRFS_EXTENT_ITEM_KEY;
- else
- key->objectid++;
- btrfs_release_path(path);
- up_read(&fs_info->commit_root_sem);
- mutex_unlock(&caching_ctl->mutex);
- cond_resched();
- mutex_lock(&caching_ctl->mutex);
- down_read(&fs_info->commit_root_sem);
}
-out:
+
lockdep_assert_held(&caching_ctl->mutex);
lockdep_assert_held_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
@@ -659,6 +647,7 @@ out:
static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
struct btrfs_block_group *block_group)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_key key;
int i;
u64 min_size = block_group->length;
@@ -668,6 +657,8 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl
if (!btrfs_block_group_should_use_size_class(block_group))
return 0;
+ lockdep_assert_held(&caching_ctl->mutex);
+ lockdep_assert_held_read(&fs_info->commit_root_sem);
for (i = 0; i < 5; ++i) {
ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
if (ret < 0)
@@ -682,7 +673,6 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl
block_group->size_class = size_class;
spin_unlock(&block_group->lock);
}
-
out:
return ret;
}
@@ -1185,14 +1175,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
< block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
- WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
- &block_group->runtime_flags) &&
- block_group->space_info->active_total_bytes
- < block_group->length);
}
block_group->space_info->total_bytes -= block_group->length;
- if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
- block_group->space_info->active_total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
block_group->space_info->bytes_zone_unusable -=
@@ -1836,7 +1820,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
btrfs_info(fs_info,
"reclaiming chunk %llu with %llu%% used %llu%% unusable",
- bg->start, div_u64(bg->used * 100, bg->length),
+ bg->start,
+ div64_u64(bg->used * 100, bg->length),
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
@@ -2493,18 +2478,29 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group_item bgi;
struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct btrfs_key key;
+ u64 old_commit_used;
+ int ret;
spin_lock(&block_group->lock);
btrfs_set_stack_block_group_used(&bgi, block_group->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
block_group->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
+ old_commit_used = block_group->commit_used;
+ block_group->commit_used = block_group->used;
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
spin_unlock(&block_group->lock);
- return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+ ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+ if (ret < 0) {
+ spin_lock(&block_group->lock);
+ block_group->commit_used = old_commit_used;
+ spin_unlock(&block_group->lock);
+ }
+
+ return ret;
}
static int insert_dev_extent(struct btrfs_trans_handle *trans,
@@ -3474,6 +3470,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
+ struct btrfs_space_info *space_info;
bool reclaim = false;
cache = btrfs_lookup_block_group(info, bytenr);
@@ -3481,6 +3478,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
ret = -ENOENT;
break;
}
+ space_info = cache->space_info;
factor = btrfs_bg_type_to_factor(cache->flags);
/*
@@ -3495,7 +3493,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
byte_in_group = bytenr - cache->start;
WARN_ON(byte_in_group > cache->length);
- spin_lock(&cache->space_info->lock);
+ spin_lock(&space_info->lock);
spin_lock(&cache->lock);
if (btrfs_test_opt(info, SPACE_CACHE) &&
@@ -3508,24 +3506,24 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
old_val += num_bytes;
cache->used = old_val;
cache->reserved -= num_bytes;
- cache->space_info->bytes_reserved -= num_bytes;
- cache->space_info->bytes_used += num_bytes;
- cache->space_info->disk_used += num_bytes * factor;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_used += num_bytes;
+ space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ spin_unlock(&space_info->lock);
} else {
old_val -= num_bytes;
cache->used = old_val;
cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(info,
- cache->space_info, num_bytes);
- cache->space_info->bytes_used -= num_bytes;
- cache->space_info->disk_used -= num_bytes * factor;
+ btrfs_space_info_update_bytes_pinned(info, space_info,
+ num_bytes);
+ space_info->bytes_used -= num_bytes;
+ space_info->disk_used -= num_bytes * factor;
reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ spin_unlock(&space_info->lock);
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0095c6e4c3d1..6b457b010cbc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1048,7 +1048,7 @@ again:
* so there is only one iref. The case that several irefs are
* in the same item doesn't exist.
*/
- btrfs_del_item(trans, root, path);
+ ret = btrfs_del_item(trans, root, path);
out:
btrfs_release_delayed_iref(node);
btrfs_release_path(path);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b53f0e30ce2b..9e1596bb208d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2250,6 +2250,20 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
+ /*
+ * Check if the checksum implementation is a fast accelerated one.
+ * As-is this is a bit of a hack and should be replaced once the csum
+ * implementations provide that information themselves.
+ */
+ switch (csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+ set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
+ break;
+ default:
+ break;
+ }
+
btrfs_info(fs_info, "using %s (%s) checksum algorithm",
btrfs_super_csum_name(csum_type),
crypto_shash_driver_name(csum_shash));
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index be94030e1dfb..138afa955370 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -763,7 +763,13 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
goto next;
}
+ flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ /*
+ * In case we split the extent map, we want to preserve the
+ * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
+ * it on the new extent maps.
+ */
clear_bit(EXTENT_FLAG_LOGGING, &flags);
modified = !list_empty(&em->list);
@@ -774,7 +780,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
if (em->start >= start && em_end <= end)
goto remove_em;
- flags = em->flags;
gen = em->generation;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0d250d052487..d84cef89cdff 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2693,8 +2693,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
spin_lock(&ctl->tree_lock);
+ /* Count initial region as zone_unusable until it gets activated. */
if (!used)
to_free = size;
+ else if (initial &&
+ test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) &&
+ (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
+ to_free = 0;
else if (initial)
to_free = block_group->zone_capacity;
else if (offset >= block_group->alloc_offset)
@@ -2722,7 +2727,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
reclaimable_unusable = block_group->zone_unusable -
(block_group->length - block_group->zone_capacity);
/* All the region is now unusable. Mark it as unused and reclaim */
- if (block_group->zone_unusable == block_group->length) {
+ if (block_group->zone_unusable == block_group->length &&
+ block_group->alloc_offset) {
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
reclaimable_unusable >=
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 4c477eae6891..24cd49229408 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -120,11 +120,8 @@ enum {
/* Indicate that we want to commit the transaction. */
BTRFS_FS_NEED_TRANS_COMMIT,
- /*
- * Indicate metadata over-commit is disabled. This is set when active
- * zone tracking is needed.
- */
- BTRFS_FS_NO_OVERCOMMIT,
+ /* This is set when active zone tracking is needed. */
+ BTRFS_FS_ACTIVE_ZONE_TRACKING,
/*
* Indicate if we have some features changed, this is mostly for
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6c18dc9a1831..957e4d76a7b6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5421,8 +5421,13 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
return -ENOMEM;
ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
- if (ret)
+ if (ret < 0)
goto out;
+ /*
+ * fscrypt_setup_filename() should never return a positive value, but
+ * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
+ */
+ ASSERT(ret == 0);
/* This needs to handle no-key deletions later on */
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 84626c8ad5bf..ba769a1eb87a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2859,6 +2859,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
di_args->bytes_used = btrfs_device_get_bytes_used(dev);
di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
+ memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
if (dev->name)
strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path));
else
@@ -3731,7 +3732,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
}
/* update qgroup status and info */
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
err = btrfs_run_qgroups(trans);
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (err < 0)
btrfs_handle_fs_error(fs_info, err,
"failed to update qgroup status and info");
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 52a7d2fa2284..f41da7ac360d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2828,13 +2828,22 @@ cleanup:
}
/*
- * called from commit_transaction. Writes all changed qgroups to disk.
+ * Writes all changed qgroups to disk.
+ * Called by the transaction commit path and the qgroup assign ioctl.
*/
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
+ /*
+ * In case we are called from the qgroup assign ioctl, assert that we
+ * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
+ * disable operation (ioctl) and access a freed quota root.
+ */
+ if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
+ lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
+
if (!fs_info->quota_root)
return ret;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 69c09508afb5..3eecce86f63f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -308,8 +308,6 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
ASSERT(found);
spin_lock(&found->lock);
found->total_bytes += block_group->length;
- if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
- found->active_total_bytes += block_group->length;
found->disk_total += block_group->length * factor;
found->bytes_used += block_group->used;
found->disk_used += block_group->used * factor;
@@ -379,22 +377,6 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
return avail;
}
-static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
-{
- /*
- * On regular filesystem, all total_bytes are always writable. On zoned
- * filesystem, there may be a limitation imposed by max_active_zones.
- * For metadata allocation, we cannot finish an existing active block
- * group to avoid a deadlock. Thus, we need to consider only the active
- * groups to be writable for metadata space.
- */
- if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
- return space_info->total_bytes;
-
- return space_info->active_total_bytes;
-}
-
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
@@ -407,13 +389,13 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
return 0;
used = btrfs_space_info_used(space_info, true);
- if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) &&
+ if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) &&
(space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
avail = 0;
else
avail = calc_available_free_space(fs_info, space_info, flush);
- if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
+ if (used + bytes < space_info->total_bytes + avail)
return 1;
return 0;
}
@@ -449,7 +431,7 @@ again:
ticket = list_first_entry(head, struct reserve_ticket, list);
/* Check and see if our ticket can be satisfied now. */
- if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
+ if ((used + ticket->bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
@@ -829,7 +811,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
{
u64 used;
u64 avail;
- u64 total;
u64 to_reclaim = space_info->reclaim_size;
lockdep_assert_held(&space_info->lock);
@@ -844,9 +825,8 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
* space. If that's the case add in our overage so we make sure to put
* appropriate pressure on the flushing state machine.
*/
- total = writable_total_bytes(fs_info, space_info);
- if (total + avail < used)
- to_reclaim += used - (total + avail);
+ if (space_info->total_bytes + avail < used)
+ to_reclaim += used - (space_info->total_bytes + avail);
return to_reclaim;
}
@@ -856,11 +836,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
{
u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
- u64 total = writable_total_bytes(fs_info, space_info);
u64 thresh;
u64 used;
- thresh = mult_perc(total, 90);
+ thresh = mult_perc(space_info->total_bytes, 90);
lockdep_assert_held(&space_info->lock);
@@ -923,8 +902,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
BTRFS_RESERVE_FLUSH_ALL);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_readonly + global_rsv_size;
- if (used < total)
- thresh += total - used;
+ if (used < space_info->total_bytes)
+ thresh += space_info->total_bytes - used;
thresh >>= space_info->clamp;
used = space_info->bytes_pinned;
@@ -1651,7 +1630,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* can_overcommit() to ensure we can overcommit to continue.
*/
if (!pending_tickets &&
- ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
+ ((used + orig_bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
@@ -1665,8 +1644,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
*/
if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
used = btrfs_space_info_used(space_info, false);
- if (used + orig_bytes <=
- writable_total_bytes(fs_info, space_info)) {
+ if (used + orig_bytes <= space_info->total_bytes) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
ret = 0;
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index fc99ea2b0c34..2033b71b18ce 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -96,8 +96,6 @@ struct btrfs_space_info {
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
- /* Total bytes in the space, but only accounts active block groups. */
- u64 active_total_bytes;
u64 bytes_zone_unusable; /* total bytes that are unusable until
resetting the device zone */
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 581845bc206a..366fb4cde145 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1516,8 +1516,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
- if (!strstr(crc32c_impl(), "generic"))
- set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
error = btrfs_fill_super(s, fs_devices, data);
}
if (!error)
@@ -1631,6 +1629,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+ workqueue_set_max_active(fs_info->endio_workers, new_pool_size);
+ workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 8c5efa5813b3..37fc58a7f27e 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -9,6 +9,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/bug.h>
+#include <linux/list.h>
#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
@@ -778,6 +779,45 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
return len;
}
+static ssize_t btrfs_size_classes_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+ struct btrfs_block_group *bg;
+ u32 none = 0;
+ u32 small = 0;
+ u32 medium = 0;
+ u32 large = 0;
+
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {
+ down_read(&sinfo->groups_sem);
+ list_for_each_entry(bg, &sinfo->block_groups[i], list) {
+ if (!btrfs_block_group_should_use_size_class(bg))
+ continue;
+ switch (bg->size_class) {
+ case BTRFS_BG_SZ_NONE:
+ none++;
+ break;
+ case BTRFS_BG_SZ_SMALL:
+ small++;
+ break;
+ case BTRFS_BG_SZ_MEDIUM:
+ medium++;
+ break;
+ case BTRFS_BG_SZ_LARGE:
+ large++;
+ break;
+ }
+ }
+ up_read(&sinfo->groups_sem);
+ }
+ return sysfs_emit(buf, "none %u\n"
+ "small %u\n"
+ "medium %u\n"
+ "large %u\n",
+ none, small, medium, large);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
/*
* Request chunk allocation with current chunk size.
@@ -835,6 +875,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
+BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -887,6 +928,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
BTRFS_ATTR_PTR(space_info, chunk_size),
+ BTRFS_ATTR_PTR(space_info, size_classes),
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 18329ebcb1cb..b8d5b1fa9a03 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2035,7 +2035,20 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
if (current->journal_info == trans)
current->journal_info = NULL;
- btrfs_scrub_cancel(fs_info);
+
+ /*
+ * If relocation is running, we can't cancel scrub because that will
+ * result in a deadlock. Before relocating a block group, relocation
+ * pauses scrub, then starts and commits a transaction before unpausing
+ * scrub. If the transaction commit is being done by the relocation
+ * task or triggered by another task and the relocation task is waiting
+ * for the commit, and we end up here due to an error in the commit
+ * path, then calling btrfs_scrub_cancel() will deadlock, as we are
+ * asking for scrub to stop while having it asked to be paused higher
+ * above in relocation code.
+ */
+ if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ btrfs_scrub_cancel(fs_info);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7823168c08a6..c6d592870400 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1366,8 +1366,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
- flags |= FMODE_EXCL;
+ /*
+ * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may
+ * initiate the device scan which may race with the user's mount
+ * or mkfs command, resulting in failure.
+ * Since the device scan is solely for reading purposes, there is
+ * no need for FMODE_EXCL. Additionally, the devices are read again
+ * during the mount process. It is ok to get some inconsistent
+ * values temporarily, as the device paths of the fsid are the only
+ * required information for assembling the volume.
+ */
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
@@ -3266,8 +3275,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
btrfs_scrub_pause(fs_info);
ret = btrfs_relocate_block_group(fs_info, chunk_offset);
btrfs_scrub_continue(fs_info);
- if (ret)
+ if (ret) {
+ /*
+ * If we had a transaction abort, stop all running scrubs.
+ * See transaction.c:cleanup_transaction() why we do it here.
+ */
+ if (BTRFS_FS_ERROR(fs_info))
+ btrfs_scrub_cancel(fs_info);
return ret;
+ }
block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!block_group)
@@ -6363,7 +6379,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
ASSERT(op != BTRFS_MAP_DISCARD);
em = btrfs_get_chunk_map(fs_info, logical, *length);
- ASSERT(!IS_ERR(em));
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
data_stripes = nr_data_stripes(map);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f95b2c94d619..45d04092f2f8 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -524,8 +524,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
atomic_set(&zone_info->active_zones_left,
max_active_zones - nactive);
- /* Overcommit does not work well with active zone tacking. */
- set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags);
+ set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
}
/* Validate superblock log */
@@ -1581,9 +1580,19 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
return;
WARN_ON(cache->bytes_super != 0);
- unusable = (cache->alloc_offset - cache->used) +
- (cache->length - cache->zone_capacity);
- free = cache->zone_capacity - cache->alloc_offset;
+
+ /* Check for block groups never get activated */
+ if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
+ cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
+ !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
+ cache->alloc_offset == 0) {
+ unusable = cache->length;
+ free = 0;
+ } else {
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
+ }
/* We only need ->free_space in ALLOC_SEQ block groups */
cache->cached = BTRFS_CACHE_FINISHED;
@@ -1902,7 +1911,11 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
- space_info->active_total_bytes += block_group->length;
+ WARN_ON(block_group->alloc_offset != 0);
+ if (block_group->zone_unusable == block_group->length) {
+ block_group->zone_unusable = block_group->length - block_group->zone_capacity;
+ space_info->bytes_zone_unusable -= block_group->zone_capacity;
+ }
spin_unlock(&block_group->lock);
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
@@ -2086,11 +2099,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
if (!device->bdev)
continue;
- if (!zinfo->max_active_zones ||
- atomic_read(&zinfo->active_zones_left)) {
+ if (!zinfo->max_active_zones) {
ret = true;
break;
}
+
+ switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case 0: /* single */
+ ret = (atomic_read(&zinfo->active_zones_left) >= 1);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ ret = (atomic_read(&zinfo->active_zones_left) >= 2);
+ break;
+ }
+ if (ret)
+ break;
}
mutex_unlock(&fs_info->chunk_mutex);
@@ -2256,7 +2279,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
u64 avail;
spin_lock(&block_group->lock);
- if (block_group->reserved ||
+ if (block_group->reserved || block_group->alloc_offset == 0 ||
(block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
spin_unlock(&block_group->lock);
continue;
@@ -2293,10 +2316,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
return 0;
- /* No more block groups to activate */
- if (space_info->active_total_bytes == space_info->total_bytes)
- return 0;
-
for (;;) {
int ret;
bool need_finish = false;