summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--fs/btrfs/extent_io.c287
1 files changed, 166 insertions, 121 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2776112dbdf8..f688fab55251 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -396,15 +396,14 @@ again:
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, cached_state);
+
+ unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- unlock_extent(tree, delalloc_start, delalloc_end,
- &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
- free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
@@ -413,9 +412,10 @@ out_failed:
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
+ struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start, end, page_ops);
@@ -667,6 +667,37 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/*
+ * Populate every free slot in a provided array with folios.
+ *
+ * @nr_folios: number of folios to allocate
+ * @folio_array: the array to fill with folios; any existing non-NULL entries in
+ * the array will be skipped
+ * @extra_gfp: the extra GFP flags for the allocation
+ *
+ * Return: 0 if all folios were able to be allocated;
+ * -ENOMEM otherwise, the partially allocated folios would be freed and
+ * the array slots zeroed
+ */
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
+ gfp_t extra_gfp)
+{
+ for (int i = 0; i < nr_folios; i++) {
+ if (folio_array[i])
+ continue;
+ folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0);
+ if (!folio_array[i])
+ goto error;
+ }
+ return 0;
+error:
+ for (int i = 0; i < nr_folios; i++) {
+ if (folio_array[i])
+ folio_put(folio_array[i]);
+ }
+ return -ENOMEM;
+}
+
+/*
* Populate every free slot in a provided array with pages.
*
* @nr_pages: number of pages to allocate
@@ -1571,7 +1602,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
* can be no longer dirty nor marked anymore for writeback (if a
* subsequent modification to the extent buffer didn't happen before the
* transaction commit), which makes filemap_fdata[write|wait]_range not
- * able to find the pages tagged with SetPageError at transaction
+ * able to find the pages which contain errors at transaction
* commit time. So if this happens we must abort the transaction,
* otherwise we commit a super block with btree roots that point to
* btree nodes/leafs whose content on disk is invalid - either garbage
@@ -2246,8 +2277,7 @@ next_page:
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
}
-int extent_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -2267,7 +2297,7 @@ int extent_writepages(struct address_space *mapping,
return ret;
}
-void extent_readahead(struct readahead_control *rac)
+void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
struct page *pagepool[16];
@@ -2325,19 +2355,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-static int try_release_extent_state(struct extent_io_tree *tree,
+static bool try_release_extent_state(struct extent_io_tree *tree,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- int ret = 1;
+ bool ret;
if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
- ret = 0;
+ ret = false;
} else {
u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
EXTENT_QGROUP_RESERVED);
+ int ret2;
/*
* At this point we can safely clear everything except the
@@ -2345,15 +2376,15 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+ ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
*/
- if (ret < 0)
- ret = 0;
+ if (ret2 < 0)
+ ret = false;
else
- ret = 1;
+ ret = true;
}
return ret;
}
@@ -2363,84 +2394,80 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
-int try_release_extent_mapping(struct page *page, gfp_t mask)
+bool try_release_extent_mapping(struct page *page, gfp_t mask)
{
- struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- struct btrfs_inode *btrfs_inode = page_to_inode(page);
- struct extent_io_tree *tree = &btrfs_inode->io_tree;
- struct extent_map_tree *map = &btrfs_inode->extent_tree;
-
- if (gfpflags_allow_blocking(mask) &&
- page->mapping->host->i_size > SZ_16M) {
- u64 len;
- while (start <= end) {
- struct btrfs_fs_info *fs_info;
- u64 cur_gen;
-
- len = end - start + 1;
- write_lock(&map->lock);
- em = lookup_extent_mapping(map, start, len);
- if (!em) {
- write_unlock(&map->lock);
- break;
- }
- if ((em->flags & EXTENT_FLAG_PINNED) ||
- em->start != start) {
- write_unlock(&map->lock);
- free_extent_map(em);
- break;
- }
- if (test_range_bit_exists(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED))
- goto next;
- /*
- * If it's not in the list of modified extents, used
- * by a fast fsync, we can remove it. If it's being
- * logged we can safely remove it since fsync took an
- * extra reference on the em.
- */
- if (list_empty(&em->list) ||
- (em->flags & EXTENT_FLAG_LOGGING))
- goto remove_em;
- /*
- * If it's in the list of modified extents, remove it
- * only if its generation is older then the current one,
- * in which case we don't need it for a fast fsync.
- * Otherwise don't remove it, we could be racing with an
- * ongoing fast fsync that could miss the new extent.
- */
- fs_info = btrfs_inode->root->fs_info;
- spin_lock(&fs_info->trans_lock);
- cur_gen = fs_info->generation;
- spin_unlock(&fs_info->trans_lock);
- if (em->generation >= cur_gen)
- goto next;
-remove_em:
- /*
- * We only remove extent maps that are not in the list of
- * modified extents or that are in the list but with a
- * generation lower then the current generation, so there
- * is no need to set the full fsync flag on the inode (it
- * hurts the fsync performance for workloads with a data
- * size that exceeds or is close to the system's memory).
- */
- remove_extent_mapping(map, em);
- /* once for the rb tree */
+ struct btrfs_inode *inode = page_to_inode(page);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+
+ while (start <= end) {
+ const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ const u64 len = end - start + 1;
+ struct extent_map_tree *extent_tree = &inode->extent_tree;
+ struct extent_map *em;
+
+ write_lock(&extent_tree->lock);
+ em = lookup_extent_mapping(extent_tree, start, len);
+ if (!em) {
+ write_unlock(&extent_tree->lock);
+ break;
+ }
+ if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
+ write_unlock(&extent_tree->lock);
free_extent_map(em);
+ break;
+ }
+ if (test_range_bit_exists(io_tree, em->start,
+ extent_map_end(em) - 1, EXTENT_LOCKED))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used by a fast
+ * fsync, we can remove it. If it's being logged we can safely
+ * remove it since fsync took an extra reference on the em.
+ */
+ if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it only if
+ * its generation is older then the current one, in which case
+ * we don't need it for a fast fsync. Otherwise don't remove it,
+ * we could be racing with an ongoing fast fsync that could miss
+ * the new extent.
+ */
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower then the current generation, so there is no
+ * need to set the full fsync flag on the inode (it hurts the
+ * fsync performance for workloads with a data size that exceeds
+ * or is close to the system's memory).
+ */
+ remove_extent_mapping(inode, em);
+ /* Once for the inode's extent map tree. */
+ free_extent_map(em);
next:
- start = extent_map_end(em);
- write_unlock(&map->lock);
+ start = extent_map_end(em);
+ write_unlock(&extent_tree->lock);
- /* once for us */
- free_extent_map(em);
+ /* Once for us, for the lookup_extent_mapping() reference. */
+ free_extent_map(em);
+
+ if (need_resched()) {
+ /*
+ * If we need to resched but we can't block just exit
+ * and leave any remaining extent maps.
+ */
+ if (!gfpflags_allow_blocking(mask))
+ break;
- cond_resched(); /* Allow large-extent preemption. */
+ cond_resched();
}
}
- return try_release_extent_state(tree, page, mask);
+ return try_release_extent_state(io_tree, page, mask);
}
struct btrfs_fiemap_entry {
@@ -2773,13 +2800,19 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p
goto out;
}
- /* See the comment at fiemap_search_slot() about why we clone. */
- copy_extent_buffer_full(clone, path->nodes[0]);
/*
* Important to preserve the start field, for the optimizations when
* checking if extents are shared (see extent_fiemap()).
+ *
+ * We must set ->start before calling copy_extent_buffer_full(). If we
+ * are on sub-pagesize blocksize, we use ->start to determine the offset
+ * into the folio where our eb exists, and if we update ->start after
+ * the fact then any subsequent reads of the eb may read from a
+ * different offset in the folio than where we originally copied into.
*/
clone->start = path->nodes[0]->start;
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ copy_extent_buffer_full(clone, path->nodes[0]);
slot = path->slots[0];
btrfs_release_path(path);
@@ -3656,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer(
struct folio *folio = page_folio(page);
struct extent_buffer *exists;
+ lockdep_assert_held(&page->mapping->i_private_lock);
+
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
@@ -3723,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct btrfs_subpage *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
const unsigned long index = eb->start >> PAGE_SHIFT;
- struct folio *existing_folio;
+ struct folio *existing_folio = NULL;
int ret;
ASSERT(found_eb_ret);
@@ -3741,12 +3777,14 @@ retry:
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
- return 0;
+ goto finish;
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
- if (IS_ERR(existing_folio))
+ if (IS_ERR(existing_folio)) {
+ existing_folio = NULL;
goto retry;
+ }
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
@@ -3757,14 +3795,13 @@ retry:
return -EAGAIN;
}
- if (fs_info->nodesize < PAGE_SIZE) {
- /*
- * We're going to reuse the existing page, can drop our page
- * and subpage structure now.
- */
+finish:
+ spin_lock(&mapping->i_private_lock);
+ if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ /* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
- } else {
+ } else if (existing_folio) {
struct extent_buffer *existing_eb;
existing_eb = grab_extent_buffer(fs_info,
@@ -3772,6 +3809,7 @@ retry:
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
+ spin_unlock(&mapping->i_private_lock);
folio_unlock(existing_folio);
folio_put(existing_folio);
return 1;
@@ -3780,6 +3818,22 @@ retry:
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
}
+ eb->folio_size = folio_size(eb->folios[i]);
+ eb->folio_shift = folio_shift(eb->folios[i]);
+ /* Should not fail, as we have preallocated the memory. */
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
+ ASSERT(!ret);
+ /*
+ * To inform we have an extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the folio private when the
+ * eb hasn't been inserted into radix tree yet.
+ *
+ * The ref will be decreased when the eb releases the page, in
+ * detach_extent_buffer_page(). Thus needs no special handling in the
+ * error path.
+ */
+ btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
@@ -3791,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
@@ -3857,7 +3910,7 @@ reallocate:
for (int i = 0; i < num_folios; i++) {
struct folio *folio;
- ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
if (ret > 0) {
ASSERT(existing_eb);
goto out;
@@ -3894,24 +3947,6 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- eb->folio_size = folio_size(folio);
- eb->folio_shift = folio_shift(folio);
- spin_lock(&mapping->i_private_lock);
- /* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_folio(eb, folio, prealloc);
- ASSERT(!ret);
- /*
- * To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the folio private
- * when the eb hasn't yet been inserted into radix tree.
- *
- * The ref will be decreased when the eb released the page, in
- * detach_extent_buffer_page().
- * Thus needs no special handling in error path.
- */
- btrfs_folio_inc_eb_refs(fs_info, folio);
- spin_unlock(&mapping->i_private_lock);
-
WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
/*
@@ -4261,6 +4296,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
}
+static void clear_extent_buffer_reading(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+}
+
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
@@ -4269,6 +4311,13 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
struct folio_iter fi;
u32 bio_offset = 0;
+ /*
+ * If the extent buffer is marked UPTODATE before the read operation
+ * completes, other calls to read_extent_buffer_pages() will return
+ * early without waiting for the read to finish, causing data races.
+ */
+ WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));
+
eb->read_mirror = bbio->mirror_num;
if (uptodate &&
@@ -4295,9 +4344,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
bio_offset += len;
}
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
bio_put(&bbio->bio);
@@ -4331,9 +4378,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
* will now be set, and we shouldn't read it in again.
*/
if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
return 0;
}