diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 179 |
1 files changed, 124 insertions, 55 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 855935f6671a..a0fa7253a2d7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1845,8 +1845,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int ret; ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); - if (ret) - bio_endio(bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -1906,8 +1908,10 @@ mapit: ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); out: - if (ret < 0) - bio_endio(bio, ret); + if (ret < 0) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -3654,6 +3658,35 @@ cache_index: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We don't persist the id of the transaction where an unlink operation + * against the inode was last made. So here we assume the inode might + * have been evicted, and therefore the exact value of last_unlink_trans + * lost, and set it to last_trans to avoid metadata inconsistencies + * between the inode and its parent if the inode is fsync'ed and the log + * replayed. For example, in the scenario: + * + * touch mydir/foo + * ln mydir/foo mydir/bar + * sync + * unlink mydir/bar + * echo 2 > /proc/sys/vm/drop_caches # evicts inode + * xfs_io -c fsync mydir/foo + * <power failure> + * mount fs, triggers fsync log replay + * + * We must make sure that when we fsync our inode foo we also log its + * parent inode, otherwise after log replay the parent still has the + * dentry with the "bar" name but our inode foo has a link count of 1 + * and doesn't have an inode ref with the name "bar" anymore. + * + * Setting last_unlink_trans to last_trans is a pessimistic approach, + * but it guarantees correctness at the expense of ocassional full + * transaction commits on fsync if our inode is a directory, or if our + * inode is not a directory, logging its parent unnecessarily. + */ + BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -4209,7 +4242,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 last_size = (u64)-1; + u64 last_size = new_size; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4493,8 +4526,7 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); @@ -4989,8 +5021,9 @@ static void evict_inode_truncate_pages(struct inode *inode) /* * Keep looping until we have no more ranges in the io tree. * We can have ongoing bios started by readpages (called from readahead) - * that didn't get their end io callbacks called yet or they are still - * in progress ((extent_io.c:end_bio_extent_readpage()). This means some + * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * still in progress (unlocked the pages in the bio but did not yet + * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before * submitting those bios, which are executed by a separate task (work * queue kthread), inode references (inode->i_count) were not taken @@ -6876,8 +6909,7 @@ out: trace_btrfs_get_extent(root, em); - if (path) - btrfs_free_path(path); + btrfs_free_path(path); if (trans) { ret = btrfs_end_transaction(trans, root); if (!err) @@ -7546,6 +7578,7 @@ unlock: current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); + set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); } /* @@ -7687,13 +7720,13 @@ struct btrfs_retry_complete { int uptodate; }; -static void btrfs_retry_endio_nocsum(struct bio *bio, int err) +static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct bio_vec *bvec; int i; - if (err) + if (bio->bi_error) goto end; done->uptodate = 1; @@ -7742,7 +7775,7 @@ try_again: return 0; } -static void btrfs_retry_endio(struct bio *bio, int err) +static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); @@ -7751,7 +7784,7 @@ static void btrfs_retry_endio(struct bio *bio, int err) int ret; int i; - if (err) + if (bio->bi_error) goto end; uptodate = 1; @@ -7834,12 +7867,13 @@ static int btrfs_subio_endio_read(struct inode *inode, } } -static void btrfs_endio_direct_read(struct bio *bio, int err) +static void btrfs_endio_direct_read(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + int err = bio->bi_error; if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); @@ -7850,17 +7884,14 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) kfree(dip); - /* If we had a csum failure make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) io_bio->end_io(io_bio, err); bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio, int err) +static void btrfs_endio_direct_write(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; @@ -7871,12 +7902,11 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct bio *dio_bio; int ret; - if (err) - goto out_done; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, - ordered_bytes, !err); + ordered_bytes, + !bio->bi_error); if (!ret) goto out_test; @@ -7895,15 +7925,11 @@ out_test: ordered = NULL; goto again; } -out_done: dio_bio = dip->dio_bio; kfree(dip); - /* If we had an error make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } @@ -7918,9 +7944,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } -static void btrfs_end_dio_bio(struct bio *bio, int err) +static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + int err = bio->bi_error; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -7949,8 +7976,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); - bio_endio(dip->orig_bio, 0); + dip->dio_bio->bi_error = 0; + bio_endio(dip->orig_bio); } out: bio_put(bio); @@ -7959,8 +7986,11 @@ out: static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, u64 first_sector, gfp_t gfp_flags) { - int nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); + struct bio *bio; + bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); + if (bio) + bio_associate_current(bio); + return bio; } static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, @@ -8163,9 +8193,8 @@ out_err: static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_dio_private *dip; - struct bio *io_bio; + struct btrfs_dio_private *dip = NULL; + struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; int write = rw & REQ_WRITE; @@ -8182,7 +8211,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_io_bio; + goto free_ordered; } dip->private = dio_bio->bi_private; @@ -8210,25 +8239,56 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (btrfs_bio->end_io) btrfs_bio->end_io(btrfs_bio, ret); -free_io_bio: - bio_put(io_bio); free_ordered: /* - * If this is a write, we need to clean up the reserved space and kill - * the ordered extent. + * If we arrived here it means either we failed to submit the dip + * or we either failed to clone the dio_bio or failed to allocate the + * dip. If we cloned the dio_bio and allocated the dip, we can just + * call bio_endio against our io_bio so that we get proper resource + * cleanup if we fail to submit the dip, otherwise, we must do the + * same as btrfs_endio_direct_[write|read] because we can't call these + * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (write) { - struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len, 1); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); + if (io_bio && dip) { + io_bio->bi_error = -EIO; + bio_endio(io_bio); + /* + * The end io callbacks free our dip, do the final put on io_bio + * and all the cleanup and final put for dio_bio (through + * dio_end_io()). + */ + dip = NULL; + io_bio = NULL; + } else { + if (write) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_extent(inode, + file_offset); + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + /* + * Decrements our ref on the ordered extent and removes + * the ordered extent from the inode's ordered tree, + * doing all the proper resource cleanup such as for the + * reserved space and waking up any waiters for this + * ordered extent (through btrfs_remove_ordered_extent). + */ + btrfs_finish_ordered_io(ordered); + } else { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + dio_bio->bi_error = -EIO; + /* + * Releases and cleans up our dio_bio, no need to bio_put() + * nor bio_endio()/bio_io_error() against dio_bio. + */ + dio_end_io(dio_bio, ret); } - bio_endio(dio_bio, ret); + if (io_bio) + bio_put(io_bio); + kfree(dip); } static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, @@ -8330,9 +8390,18 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) - btrfs_delalloc_release_space(inode, count); - else if (ret >= 0 && (size_t)ret < count) + if (ret < 0 && ret != -EIOCBQUEUED) { + /* + * If the error comes from submitting stage, + * btrfs_get_blocsk_direct() has free'd data space, + * and metadata space will be handled by + * finish_ordered_fn, don't do that again to make + * sure bytes_may_use is correct. + */ + if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, + &BTRFS_I(inode)->runtime_flags)) + btrfs_delalloc_release_space(inode, count); + } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); } |