summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c621
1 files changed, 158 insertions, 463 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d01631d47806..91b00eb2440e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,329 +30,13 @@
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
-
-static struct kmem_cache *btrfs_inode_defrag_cachep;
-/*
- * when auto defrag is enabled we
- * queue up these defrag structs to remember which
- * inodes need defragging passes
- */
-struct inode_defrag {
- struct rb_node rb_node;
- /* objectid */
- u64 ino;
- /*
- * transid where the defrag was added, we search for
- * extents newer than this
- */
- u64 transid;
-
- /* root objectid */
- u64 root;
-
- /*
- * The extent size threshold for autodefrag.
- *
- * This value is different for compressed/non-compressed extents,
- * thus needs to be passed from higher layer.
- * (aka, inode_should_defrag())
- */
- u32 extent_thresh;
-};
-
-static int __compare_inode_defrag(struct inode_defrag *defrag1,
- struct inode_defrag *defrag2)
-{
- if (defrag1->root > defrag2->root)
- return 1;
- else if (defrag1->root < defrag2->root)
- return -1;
- else if (defrag1->ino > defrag2->ino)
- return 1;
- else if (defrag1->ino < defrag2->ino)
- return -1;
- else
- return 0;
-}
-
-/* pop a record for an inode into the defrag tree. The lock
- * must be held already
- *
- * If you're inserting a record for an older transid than an
- * existing record, the transid already in the tree is lowered
- *
- * If an existing record is found the defrag item you
- * pass in is freed
- */
-static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct inode_defrag *entry;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- int ret;
-
- p = &fs_info->defrag_inodes.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
-
- ret = __compare_inode_defrag(defrag, entry);
- if (ret < 0)
- p = &parent->rb_left;
- else if (ret > 0)
- p = &parent->rb_right;
- else {
- /* if we're reinserting an entry for
- * an old defrag run, make sure to
- * lower the transid of our existing record
- */
- if (defrag->transid < entry->transid)
- entry->transid = defrag->transid;
- entry->extent_thresh = min(defrag->extent_thresh,
- entry->extent_thresh);
- return -EEXIST;
- }
- }
- set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
- rb_link_node(&defrag->rb_node, parent, p);
- rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
- return 0;
-}
-
-static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
-{
- if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
- return 0;
-
- if (btrfs_fs_closing(fs_info))
- return 0;
-
- return 1;
-}
-
-/*
- * insert a defrag record for this inode if auto defrag is
- * enabled
- */
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode_defrag *defrag;
- u64 transid;
- int ret;
-
- if (!__need_auto_defrag(fs_info))
- return 0;
-
- if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
- return 0;
-
- if (trans)
- transid = trans->transid;
- else
- transid = inode->root->last_trans;
-
- defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
- if (!defrag)
- return -ENOMEM;
-
- defrag->ino = btrfs_ino(inode);
- defrag->transid = transid;
- defrag->root = root->root_key.objectid;
- defrag->extent_thresh = extent_thresh;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
- /*
- * If we set IN_DEFRAG flag and evict the inode from memory,
- * and then re-read this inode, this new inode doesn't have
- * IN_DEFRAG flag. At the case, we may find the existed defrag.
- */
- ret = __btrfs_add_inode_defrag(inode, defrag);
- if (ret)
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- } else {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- }
- spin_unlock(&fs_info->defrag_inodes_lock);
- return 0;
-}
-
-/*
- * pick the defragable inode that we want, if it doesn't exist, we will get
- * the next one.
- */
-static struct inode_defrag *
-btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
-{
- struct inode_defrag *entry = NULL;
- struct inode_defrag tmp;
- struct rb_node *p;
- struct rb_node *parent = NULL;
- int ret;
-
- tmp.ino = ino;
- tmp.root = root;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- p = fs_info->defrag_inodes.rb_node;
- while (p) {
- parent = p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
-
- ret = __compare_inode_defrag(&tmp, entry);
- if (ret < 0)
- p = parent->rb_left;
- else if (ret > 0)
- p = parent->rb_right;
- else
- goto out;
- }
-
- if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
- parent = rb_next(parent);
- if (parent)
- entry = rb_entry(parent, struct inode_defrag, rb_node);
- else
- entry = NULL;
- }
-out:
- if (entry)
- rb_erase(parent, &fs_info->defrag_inodes);
- spin_unlock(&fs_info->defrag_inodes_lock);
- return entry;
-}
-
-void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
-{
- struct inode_defrag *defrag;
- struct rb_node *node;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- node = rb_first(&fs_info->defrag_inodes);
- while (node) {
- rb_erase(node, &fs_info->defrag_inodes);
- defrag = rb_entry(node, struct inode_defrag, rb_node);
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-
- cond_resched_lock(&fs_info->defrag_inodes_lock);
-
- node = rb_first(&fs_info->defrag_inodes);
- }
- spin_unlock(&fs_info->defrag_inodes_lock);
-}
-
-#define BTRFS_DEFRAG_BATCH 1024
-
-static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
- struct inode_defrag *defrag)
-{
- struct btrfs_root *inode_root;
- struct inode *inode;
- struct btrfs_ioctl_defrag_range_args range;
- int ret = 0;
- u64 cur = 0;
-
-again:
- if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
- goto cleanup;
- if (!__need_auto_defrag(fs_info))
- goto cleanup;
-
- /* get the inode */
- inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
- if (IS_ERR(inode_root)) {
- ret = PTR_ERR(inode_root);
- goto cleanup;
- }
-
- inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
- btrfs_put_root(inode_root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- goto cleanup;
- }
-
- if (cur >= i_size_read(inode)) {
- iput(inode);
- goto cleanup;
- }
-
- /* do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
- memset(&range, 0, sizeof(range));
- range.len = (u64)-1;
- range.start = cur;
- range.extent_thresh = defrag->extent_thresh;
-
- sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
- sb_end_write(fs_info->sb);
- iput(inode);
-
- if (ret < 0)
- goto cleanup;
-
- cur = max(cur + fs_info->sectorsize, range.start);
- goto again;
-
-cleanup:
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- return ret;
-}
-
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
-{
- struct inode_defrag *defrag;
- u64 first_ino = 0;
- u64 root_objectid = 0;
-
- atomic_inc(&fs_info->defrag_running);
- while (1) {
- /* Pause the auto defragger. */
- if (test_bit(BTRFS_FS_STATE_REMOUNTING,
- &fs_info->fs_state))
- break;
-
- if (!__need_auto_defrag(fs_info))
- break;
-
- /* find an inode to defrag */
- defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
- first_ino);
- if (!defrag) {
- if (root_objectid || first_ino) {
- root_objectid = 0;
- first_ino = 0;
- continue;
- } else {
- break;
- }
- }
-
- first_ino = defrag->ino + 1;
- root_objectid = defrag->root;
-
- __btrfs_run_defrag_inode(fs_info, defrag);
- }
- atomic_dec(&fs_info->defrag_running);
-
- /*
- * during unmount, we use the transaction_wait queue to
- * wait for the defragger to stop
- */
- wake_up(&fs_info->transaction_wait);
- return 0;
-}
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "file-item.h"
+#include "ioctl.h"
+#include "file.h"
+#include "super.h"
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
@@ -696,7 +380,10 @@ next_slot:
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
key.offset = args->start;
}
@@ -783,7 +470,10 @@ delete_extent_item:
key.offset - extent_offset, 0,
false);
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
args->bytes_found += extent_end - key.offset;
}
@@ -1302,7 +992,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
struct btrfs_ordered_extent *ordered;
if (nowait) {
- if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) {
+ if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state)) {
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
@@ -1372,6 +1063,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
+ struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
u64 num_bytes;
int ret;
@@ -1388,12 +1080,14 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
num_bytes = lockend - lockstart + 1;
if (nowait) {
- if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) {
+ if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
+ &cached_state)) {
btrfs_drew_write_unlock(&root->snapshot_lock);
return -EAGAIN;
}
} else {
- btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
+ &cached_state);
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL, nowait, false);
@@ -1402,7 +1096,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
else
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
- unlock_extent(&inode->io_tree, lockstart, lockend, NULL);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
return ret;
}
@@ -1505,7 +1199,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
- ret = btrfs_inode_lock(inode, ilock_flags);
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (ret < 0)
return ret;
@@ -1740,7 +1434,7 @@ again:
iocb->ki_pos += num_written;
}
out:
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
return num_written ? num_written : ret;
}
@@ -1780,19 +1474,19 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
- err = btrfs_inode_lock(inode, ilock_flags);
+ err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (err < 0)
return err;
err = generic_write_checks(iocb, from);
if (err <= 0) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
return err;
}
err = btrfs_write_check(iocb, from, err);
if (err < 0) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
}
@@ -1803,13 +1497,13 @@ relock:
*/
if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
pos + iov_iter_count(from) > i_size_read(inode)) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
ilock_flags &= ~BTRFS_ILOCK_SHARED;
goto relock;
}
if (check_direct_IO(fs_info, from, pos)) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto buffered;
}
@@ -1840,7 +1534,7 @@ relock:
* iocb, and that needs to lock the inode. So unlock it before calling
* iomap_dio_complete() to avoid a deadlock.
*/
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
if (IS_ERR_OR_NULL(dio))
err = PTR_ERR_OR_ZERO(dio);
@@ -1887,8 +1581,8 @@ buffered:
/*
* If we are in a NOWAIT context, then return -EAGAIN to signal the caller
* it must retry the operation in a context where blocking is acceptable,
- * since we currently don't have NOWAIT semantics support for buffered IO
- * and may block there for many reasons (reserving space for example).
+ * because even if we end up not blocking during the buffered IO attempt
+ * below, we will block when flushing and waiting for the IO.
*/
if (iocb->ki_flags & IOCB_NOWAIT) {
err = -EAGAIN;
@@ -1928,7 +1622,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
loff_t count;
ssize_t ret;
- btrfs_inode_lock(inode, 0);
+ btrfs_inode_lock(BTRFS_I(inode), 0);
count = encoded->len;
ret = generic_write_checks_count(iocb, &count);
if (ret == 0 && count != encoded->len) {
@@ -1947,7 +1641,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
- btrfs_inode_unlock(inode, 0);
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
return ret;
}
@@ -2008,10 +1702,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
- if (private && private->filldir_buf)
+ if (private) {
kfree(private->filldir_buf);
- kfree(private);
- filp->private_data = NULL;
+ free_extent_state(private->llseek_cached_state);
+ kfree(private);
+ filp->private_data = NULL;
+ }
/*
* Set by setattr when we are about to truncate a file from a non-zero
@@ -2118,7 +1814,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -2142,7 +1838,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2245,7 +1941,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
@@ -2313,7 +2009,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2908,7 +2604,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
@@ -2956,7 +2652,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
}
@@ -3055,7 +2751,7 @@ out_only_mutex:
ret = ret2;
}
}
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3366,7 +3062,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_PUNCH_HOLE)
return btrfs_punch_hole(file, offset, len);
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@@ -3416,7 +3112,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3514,7 +3210,7 @@ out_unlock:
unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state);
out:
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
return ret;
}
@@ -3526,117 +3222,106 @@ out:
* looping while it gets adjacent subranges, and merging them together.
*/
static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
+ bool *search_io_tree,
u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
- const u64 len = end + 1 - start;
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- u64 em_end;
- u64 delalloc_len;
+ u64 len = end + 1 - start;
+ u64 delalloc_len = 0;
+ struct btrfs_ordered_extent *oe;
+ u64 oe_start;
+ u64 oe_end;
/*
* Search the io tree first for EXTENT_DELALLOC. If we find any, it
* means we have delalloc (dirty pages) for which writeback has not
* started yet.
*/
- *delalloc_start_ret = start;
- delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end,
- len, EXTENT_DELALLOC, 1);
- /*
- * If delalloc was found then *delalloc_start_ret has a sector size
- * aligned value (rounded down).
- */
- if (delalloc_len > 0)
+ if (*search_io_tree) {
+ spin_lock(&inode->lock);
+ if (inode->delalloc_bytes > 0) {
+ spin_unlock(&inode->lock);
+ *delalloc_start_ret = start;
+ delalloc_len = count_range_bits(&inode->io_tree,
+ delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1,
+ cached_state);
+ } else {
+ spin_unlock(&inode->lock);
+ }
+ }
+
+ if (delalloc_len > 0) {
+ /*
+ * If delalloc was found then *delalloc_start_ret has a sector size
+ * aligned value (rounded down).
+ */
*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+ if (*delalloc_start_ret == start) {
+ /* Delalloc for the whole range, nothing more to do. */
+ if (*delalloc_end_ret == end)
+ return true;
+ /* Else trim our search range for ordered extents. */
+ start = *delalloc_end_ret + 1;
+ len = end + 1 - start;
+ }
+ } else {
+ /* No delalloc, future calls don't need to search again. */
+ *search_io_tree = false;
+ }
+
/*
- * Now also check if there's any extent map in the range that does not
- * map to a hole or prealloc extent. We do this because:
+ * Now also check if there's any ordered extent in the range.
+ * We do this because:
*
* 1) When delalloc is flushed, the file range is locked, we clear the
- * EXTENT_DELALLOC bit from the io tree and create an extent map for
- * an allocated extent. So we might just have been called after
- * delalloc is flushed and before the ordered extent completes and
- * inserts the new file extent item in the subvolume's btree;
+ * EXTENT_DELALLOC bit from the io tree and create an extent map and
+ * an ordered extent for the write. So we might just have been called
+ * after delalloc is flushed and before the ordered extent completes
+ * and inserts the new file extent item in the subvolume's btree;
*
- * 2) We may have an extent map created by flushing delalloc for a
+ * 2) We may have an ordered extent created by flushing delalloc for a
* subrange that starts before the subrange we found marked with
* EXTENT_DELALLOC in the io tree.
+ *
+ * We could also use the extent map tree to find such delalloc that is
+ * being flushed, but using the ordered extents tree is more efficient
+ * because it's usually much smaller as ordered extents are removed from
+ * the tree once they complete. With the extent maps, we mau have them
+ * in the extent map tree for a very long time, and they were either
+ * created by previous writes or loaded by read operations.
*/
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- read_unlock(&em_tree->lock);
-
- /* extent_map_end() returns a non-inclusive end offset. */
- em_end = em ? extent_map_end(em) : 0;
-
- /*
- * If we have a hole/prealloc extent map, check the next one if this one
- * ends before our range's end.
- */
- if (em && (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
- struct extent_map *next_em;
-
- read_lock(&em_tree->lock);
- next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);
- read_unlock(&em_tree->lock);
-
- free_extent_map(em);
- em_end = next_em ? extent_map_end(next_em) : 0;
- em = next_em;
- }
+ oe = btrfs_lookup_first_ordered_range(inode, start, len);
+ if (!oe)
+ return (delalloc_len > 0);
- if (em && (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- free_extent_map(em);
- em = NULL;
- }
+ /* The ordered extent may span beyond our search range. */
+ oe_start = max(oe->file_offset, start);
+ oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
- /*
- * No extent map or one for a hole or prealloc extent. Use the delalloc
- * range we found in the io tree if we have one.
- */
- if (!em)
- return (delalloc_len > 0);
+ btrfs_put_ordered_extent(oe);
- /*
- * We don't have any range as EXTENT_DELALLOC in the io tree, so the
- * extent map is the only subrange representing delalloc.
- */
+ /* Don't have unflushed delalloc, return the ordered extent range. */
if (delalloc_len == 0) {
- *delalloc_start_ret = em->start;
- *delalloc_end_ret = min(end, em_end - 1);
- free_extent_map(em);
+ *delalloc_start_ret = oe_start;
+ *delalloc_end_ret = oe_end;
return true;
}
/*
- * The extent map represents a delalloc range that starts before the
- * delalloc range we found in the io tree.
+ * We have both unflushed delalloc (io_tree) and an ordered extent.
+ * If the ranges are adjacent returned a combined range, otherwise
+ * return the leftmost range.
*/
- if (em->start < *delalloc_start_ret) {
- *delalloc_start_ret = em->start;
- /*
- * If the ranges are adjacent, return a combined range.
- * Otherwise return the extent map's range.
- */
- if (em_end < *delalloc_start_ret)
- *delalloc_end_ret = min(end, em_end - 1);
-
- free_extent_map(em);
- return true;
+ if (oe_start < *delalloc_start_ret) {
+ if (oe_end < *delalloc_start_ret)
+ *delalloc_end_ret = oe_end;
+ *delalloc_start_ret = oe_start;
+ } else if (*delalloc_end_ret + 1 == oe_start) {
+ *delalloc_end_ret = oe_end;
}
- /*
- * The extent map starts after the delalloc range we found in the io
- * tree. If it's adjacent, return a combined range, otherwise return
- * the range found in the io tree.
- */
- if (*delalloc_end_ret + 1 == em->start)
- *delalloc_end_ret = min(end, em_end - 1);
-
- free_extent_map(em);
return true;
}
@@ -3648,6 +3333,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* sector size aligned.
* @end: The end offset (inclusive value) of the search range.
* It does not need to be sector size aligned.
+ * @cached_state: Extent state record used for speeding up delalloc
+ * searches in the inode's io_tree. Can be NULL.
* @delalloc_start_ret: Output argument, set to the start offset of the
* subrange found with delalloc (may not be sector size
* aligned).
@@ -3659,10 +3346,12 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* end offsets of the subrange.
*/
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
u64 prev_delalloc_end = 0;
+ bool search_io_tree = true;
bool ret = false;
while (cur_offset < end) {
@@ -3671,6 +3360,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
bool delalloc;
delalloc = find_delalloc_subrange(inode, cur_offset, end,
+ cached_state, &search_io_tree,
&delalloc_start,
&delalloc_end);
if (!delalloc)
@@ -3716,13 +3406,14 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
* is found, it updates @start_ret with the start of the subrange.
*/
static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+ struct extent_state **cached_state,
u64 start, u64 end, u64 *start_ret)
{
u64 delalloc_start;
u64 delalloc_end;
bool delalloc;
- delalloc = btrfs_find_delalloc_in_range(inode, start, end,
+ delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
&delalloc_start, &delalloc_end);
if (delalloc && whence == SEEK_DATA) {
*start_ret = delalloc_start;
@@ -3765,11 +3456,13 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
return false;
}
-static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
- int whence)
+static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
+ struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
+ struct btrfs_file_private *private = file->private_data;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_state *cached_state = NULL;
+ struct extent_state **delalloc_cached_state;
const loff_t i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
struct btrfs_root *root = inode->root;
@@ -3794,6 +3487,22 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
inode_get_bytes(&inode->vfs_inode) == i_size)
return i_size;
+ if (!private) {
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
+ /*
+ * No worries if memory allocation failed.
+ * The private structure is used only for speeding up multiple
+ * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
+ * so everything will still be correct.
+ */
+ file->private_data = private;
+ }
+
+ if (private)
+ delalloc_cached_state = &private->llseek_cached_state;
+ else
+ delalloc_cached_state = NULL;
+
/*
* offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
@@ -3871,6 +3580,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
search_start = offset;
found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state,
search_start,
key.offset - 1,
&found_start);
@@ -3905,6 +3615,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
search_start = offset;
found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state,
search_start,
extent_end - 1,
&found_start);
@@ -3946,7 +3657,8 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
/* We have an implicit hole from the last extent found up to i_size. */
if (!found && start < i_size) {
- found = find_desired_extent_in_hole(inode, whence, start,
+ found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state, start,
i_size - 1, &start);
if (!found)
start = i_size;
@@ -3974,9 +3686,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- offset = find_desired_extent(BTRFS_I(inode), offset, whence);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+ offset = find_desired_extent(file, offset, whence);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
break;
}
@@ -4031,7 +3743,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
return 0;
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
/*
* This is similar to what we do for direct IO writes, see the comment
@@ -4080,7 +3792,7 @@ again:
goto again;
}
}
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
return ret < 0 ? ret : read;
}
@@ -4117,23 +3829,6 @@ const struct file_operations btrfs_file_operations = {
.remap_file_range = btrfs_remap_file_range,
};
-void __cold btrfs_auto_defrag_exit(void)
-{
- kmem_cache_destroy(btrfs_inode_defrag_cachep);
-}
-
-int __init btrfs_auto_defrag_init(void)
-{
- btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
- sizeof(struct inode_defrag), 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_inode_defrag_cachep)
- return -ENOMEM;
-
- return 0;
-}
-
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
int ret;