summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c336
1 files changed, 207 insertions, 129 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9879bd474632..f5fbe576d2ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
struct btrfs_root *root,
int load_cache_only)
{
+ DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl;
int ret = 0;
- smp_mb();
- if (cache->cached != BTRFS_CACHE_NO)
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+ BUG_ON(!caching_ctl);
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ atomic_set(&caching_ctl->count, 1);
+ caching_ctl->work.func = caching_thread;
+
+ spin_lock(&cache->lock);
+ /*
+ * This should be a rare occasion, but this could happen I think in the
+ * case where one thread starts to load the space cache info, and then
+ * some other thread starts a transaction commit which tries to do an
+ * allocation while the other thread is still loading the space cache
+ * info. The previous loop should have kept us from choosing this block
+ * group, but if we've moved to the state where we will wait on caching
+ * block groups we need to first check if we're doing a fast load here,
+ * so we can wait for it to finish, otherwise we could end up allocating
+ * from a block group who's cache gets evicted for one reason or
+ * another.
+ */
+ while (cache->cached == BTRFS_CACHE_FAST) {
+ struct btrfs_caching_control *ctl;
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&cache->lock);
+
+ schedule();
+
+ finish_wait(&ctl->wait, &wait);
+ put_caching_control(ctl);
+ spin_lock(&cache->lock);
+ }
+
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ kfree(caching_ctl);
return 0;
+ }
+ WARN_ON(cache->caching_ctl);
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
/*
* We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
if (trans && (!trans->transaction->in_commit) &&
(root && root != root->fs_info->tree_root) &&
btrfs_test_opt(root, SPACE_CACHE)) {
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- return 0;
- }
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
-
ret = load_free_space_cache(fs_info, cache);
spin_lock(&cache->lock);
if (ret == 1) {
+ cache->caching_ctl = NULL;
cache->cached = BTRFS_CACHE_FINISHED;
cache->last_byte_to_unpin = (u64)-1;
} else {
- cache->cached = BTRFS_CACHE_NO;
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
}
spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
if (ret == 1) {
+ put_caching_control(caching_ctl);
free_excluded_extents(fs_info->extent_root, cache);
return 0;
}
+ } else {
+ /*
+ * We are not going to do the fast caching, set cached to the
+ * appropriate value and wakeup any waiters.
+ */
+ spin_lock(&cache->lock);
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ }
+ spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
}
- if (load_cache_only)
- return 0;
-
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
- BUG_ON(!caching_ctl);
-
- INIT_LIST_HEAD(&caching_ctl->list);
- mutex_init(&caching_ctl->mutex);
- init_waitqueue_head(&caching_ctl->wait);
- caching_ctl->block_group = cache;
- caching_ctl->progress = cache->key.objectid;
- /* one for caching kthread, one for caching block group list */
- atomic_set(&caching_ctl->count, 2);
- caching_ctl->work.func = caching_thread;
-
- spin_lock(&cache->lock);
- if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
- kfree(caching_ctl);
+ if (load_cache_only) {
+ put_caching_control(caching_ctl);
return 0;
}
- cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&cache->lock);
down_write(&fs_info->extent_commit_sem);
+ atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
up_write(&fs_info->extent_commit_sem);
@@ -2781,7 +2822,7 @@ out_free:
btrfs_release_path(path);
out:
spin_lock(&block_group->lock);
- if (!ret)
+ if (!ret && dcs == BTRFS_DC_SETUP)
block_group->cache_generation = trans->transid;
block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
@@ -3797,16 +3838,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
kfree(rsv);
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int flush)
{
int ret;
if (num_bytes == 0)
return 0;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 1);
return 0;
@@ -3815,22 +3856,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
return ret;
}
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
{
- int ret;
-
- if (num_bytes == 0)
- return 0;
-
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
- return 0;
- }
-
- return ret;
+ return __block_rsv_add(root, block_rsv, num_bytes, 0);
}
int btrfs_block_rsv_check(struct btrfs_root *root,
@@ -3851,9 +3888,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
+static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int flush)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -3872,7 +3909,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
if (!ret)
return 0;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 0);
return 0;
@@ -3881,6 +3918,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved)
+{
+ return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
+}
+
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved)
+{
+ return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
+}
+
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes)
@@ -4064,23 +4115,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
*/
static unsigned drop_outstanding_extent(struct inode *inode)
{
+ unsigned drop_inode_space = 0;
unsigned dropped_extents = 0;
BUG_ON(!BTRFS_I(inode)->outstanding_extents);
BTRFS_I(inode)->outstanding_extents--;
+ if (BTRFS_I(inode)->outstanding_extents == 0 &&
+ BTRFS_I(inode)->delalloc_meta_reserved) {
+ drop_inode_space = 1;
+ BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ }
+
/*
* If we have more or the same amount of outsanding extents than we have
* reserved then we need to leave the reserved extents count alone.
*/
if (BTRFS_I(inode)->outstanding_extents >=
BTRFS_I(inode)->reserved_extents)
- return 0;
+ return drop_inode_space;
dropped_extents = BTRFS_I(inode)->reserved_extents -
BTRFS_I(inode)->outstanding_extents;
BTRFS_I(inode)->reserved_extents -= dropped_extents;
- return dropped_extents;
+ return dropped_extents + drop_inode_space;
}
/**
@@ -4146,12 +4204,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
u64 to_reserve = 0;
+ u64 csum_bytes;
unsigned nr_extents = 0;
+ int extra_reserve = 0;
int flush = 1;
int ret;
+ /* Need to be holding the i_mutex here if we aren't free space cache */
if (btrfs_is_free_space_inode(root, inode))
flush = 0;
+ else
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
if (flush && btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
@@ -4162,14 +4225,22 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
BTRFS_I(inode)->outstanding_extents++;
if (BTRFS_I(inode)->outstanding_extents >
- BTRFS_I(inode)->reserved_extents) {
+ BTRFS_I(inode)->reserved_extents)
nr_extents = BTRFS_I(inode)->outstanding_extents -
BTRFS_I(inode)->reserved_extents;
- BTRFS_I(inode)->reserved_extents += nr_extents;
- to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+ /*
+ * Add an item to reserve for updating the inode when we complete the
+ * delalloc io.
+ */
+ if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+ nr_extents++;
+ extra_reserve = 1;
}
+
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+ csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4179,22 +4250,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_lock(&BTRFS_I(inode)->lock);
dropped = drop_outstanding_extent(inode);
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
- spin_unlock(&BTRFS_I(inode)->lock);
- to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
/*
- * Somebody could have come in and twiddled with the
- * reservation, so if we have to free more than we would have
- * reserved from this reservation go ahead and release those
- * bytes.
+ * If the inodes csum_bytes is the same as the original
+ * csum_bytes then we know we haven't raced with any free()ers
+ * so we can just reduce our inodes csum bytes and carry on.
+ * Otherwise we have to do the normal free thing to account for
+ * the case that the free side didn't free up its reserve
+ * because of this outstanding reservation.
*/
- to_free -= to_reserve;
+ if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+ calc_csum_metadata_size(inode, num_bytes, 0);
+ else
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
if (to_free)
btrfs_block_rsv_release(root, block_rsv, to_free);
return ret;
}
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (extra_reserve) {
+ BTRFS_I(inode)->delalloc_meta_reserved = 1;
+ nr_extents--;
+ }
+ BTRFS_I(inode)->reserved_extents += nr_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
block_rsv_add_bytes(block_rsv, to_reserve, 1);
return 0;
@@ -5040,11 +5124,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root = orig_root->fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group_cache *used_block_group;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
- int last_ptr_loop = 0;
int loop = 0;
int index = 0;
int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -5106,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
+ used_block_group = block_group;
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -5143,6 +5228,7 @@ search:
u64 offset;
int cached;
+ used_block_group = block_group;
btrfs_get_block_group(block_group);
search_start = block_group->key.objectid;
@@ -5166,13 +5252,15 @@ search:
}
have_block_group:
- if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached)) {
u64 free_percent;
+ found_uncached_bg = true;
ret = cache_block_group(block_group, trans,
orig_root, 1);
if (block_group->cached == BTRFS_CACHE_FINISHED)
- goto have_block_group;
+ goto alloc;
free_percent = btrfs_block_group_used(&block_group->item);
free_percent *= 100;
@@ -5194,7 +5282,6 @@ have_block_group:
orig_root, 0);
BUG_ON(ret);
}
- found_uncached_bg = true;
/*
* If loop is set for cached only, try the next block
@@ -5204,94 +5291,80 @@ have_block_group:
goto loop;
}
- cached = block_group_cache_done(block_group);
- if (unlikely(!cached))
- found_uncached_bg = true;
-
+alloc:
if (unlikely(block_group->ro))
goto loop;
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
- num_bytes + empty_size) {
+ num_bytes + empty_cluster + empty_size) {
spin_unlock(&block_group->free_space_ctl->tree_lock);
goto loop;
}
spin_unlock(&block_group->free_space_ctl->tree_lock);
/*
- * Ok we want to try and use the cluster allocator, so lets look
- * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
- * have tried the cluster allocator plenty of times at this
- * point and not have found anything, so we are likely way too
- * fragmented for the clustering stuff to find anything, so lets
- * just skip it and let the allocator find whatever block it can
- * find
+ * Ok we want to try and use the cluster allocator, so
+ * lets look there
*/
- if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+ if (last_ptr) {
/*
* the refill lock keeps out other
* people trying to start a new cluster
*/
spin_lock(&last_ptr->refill_lock);
- if (last_ptr->block_group &&
- (last_ptr->block_group->ro ||
- !block_group_bits(last_ptr->block_group, data))) {
- offset = 0;
+ used_block_group = last_ptr->block_group;
+ if (used_block_group != block_group &&
+ (!used_block_group ||
+ used_block_group->ro ||
+ !block_group_bits(used_block_group, data))) {
+ used_block_group = block_group;
goto refill_cluster;
}
- offset = btrfs_alloc_from_cluster(block_group, last_ptr,
- num_bytes, search_start);
+ if (used_block_group != block_group)
+ btrfs_get_block_group(used_block_group);
+
+ offset = btrfs_alloc_from_cluster(used_block_group,
+ last_ptr, num_bytes, used_block_group->key.objectid);
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
- spin_lock(&last_ptr->lock);
- /*
- * whoops, this cluster doesn't actually point to
- * this block group. Get a ref on the block
- * group is does point to and try again
- */
- if (!last_ptr_loop && last_ptr->block_group &&
- last_ptr->block_group != block_group &&
- index <=
- get_block_group_index(last_ptr->block_group)) {
-
- btrfs_put_block_group(block_group);
- block_group = last_ptr->block_group;
- btrfs_get_block_group(block_group);
- spin_unlock(&last_ptr->lock);
- spin_unlock(&last_ptr->refill_lock);
-
- last_ptr_loop = 1;
- search_start = block_group->key.objectid;
- /*
- * we know this block group is properly
- * in the list because
- * btrfs_remove_block_group, drops the
- * cluster before it removes the block
- * group from the list
- */
- goto have_block_group;
+ WARN_ON(last_ptr->block_group != used_block_group);
+ if (used_block_group != block_group) {
+ btrfs_put_block_group(used_block_group);
+ used_block_group = block_group;
}
- spin_unlock(&last_ptr->lock);
refill_cluster:
+ BUG_ON(used_block_group != block_group);
+ /* If we are on LOOP_NO_EMPTY_SIZE, we can't
+ * set up a new clusters, so lets just skip it
+ * and let the allocator find whatever block
+ * it can find. If we reach this point, we
+ * will have tried the cluster allocator
+ * plenty of times and not have found
+ * anything, so we are likely way too
+ * fragmented for the clustering stuff to find
+ * anything. */
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
/*
* this cluster didn't work out, free it and
* start over
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
- last_ptr_loop = 0;
-
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
- offset, num_bytes,
+ search_start, num_bytes,
empty_cluster + empty_size);
if (ret == 0) {
/*
@@ -5327,6 +5400,7 @@ refill_cluster:
goto loop;
}
+unclustered_alloc:
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
/*
@@ -5353,14 +5427,14 @@ checks:
search_start = stripe_align(root, offset);
/* move on to the next group */
if (search_start + num_bytes >= search_end) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
/* move on to the next group */
if (search_start + num_bytes >
- block_group->key.objectid + block_group->key.offset) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ used_block_group->key.objectid + used_block_group->key.offset) {
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
@@ -5368,14 +5442,14 @@ checks:
ins->offset = num_bytes;
if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
+ btrfs_add_free_space(used_block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
- ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+ ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
alloc_type);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, offset, num_bytes);
+ btrfs_add_free_space(used_block_group, offset, num_bytes);
goto loop;
}
@@ -5384,15 +5458,19 @@ checks:
ins->offset = num_bytes;
if (offset < search_start)
- btrfs_add_free_space(block_group, offset,
+ btrfs_add_free_space(used_block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
+ if (used_block_group != block_group)
+ btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
+ if (used_block_group != block_group)
+ btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);