diff options
Diffstat (limited to 'fs')
66 files changed, 1199 insertions, 291 deletions
diff --git a/fs/affs/file.c b/fs/affs/file.c index d2468bf95669..a91795e01a7f 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -699,8 +699,10 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, boff = tmp % bsize; if (boff) { bh = affs_bread_ino(inode, bidx, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } tmp = min(bsize - boff, to - from); BUG_ON(boff + tmp > bsize || tmp > bsize); memcpy(AFFS_DATA(bh) + boff, data + from, tmp); @@ -712,14 +714,16 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, bidx++; } else if (bidx) { bh = affs_bread_ino(inode, bidx - 1, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } } while (from + bsize <= to) { prev_bh = bh; bh = affs_getemptyblk_ino(inode, bidx); if (IS_ERR(bh)) - goto out; + goto err_bh; memcpy(AFFS_DATA(bh), data + from, bsize); if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); @@ -751,7 +755,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, prev_bh = bh; bh = affs_bread_ino(inode, bidx, 1); if (IS_ERR(bh)) - goto out; + goto err_bh; tmp = min(bsize, to - from); BUG_ON(tmp > bsize); memcpy(AFFS_DATA(bh), data + from, tmp); @@ -790,12 +794,13 @@ done: if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; +err_first_bh: unlock_page(page); page_cache_release(page); return written; -out: +err_bh: bh = prev_bh; if (!written) written = PTR_ERR(bh); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 993642199326..6d67f32e648d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1645,14 +1645,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, parent_nritems = btrfs_header_nritems(parent); blocksize = root->nodesize; - end_slot = parent_nritems; + end_slot = parent_nritems - 1; - if (parent_nritems == 1) + if (parent_nritems <= 1) return 
0; btrfs_set_lock_blocking(parent); - for (i = start_slot; i < end_slot; i++) { + for (i = start_slot; i <= end_slot; i++) { int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1669,7 +1669,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, other = btrfs_node_blockptr(parent, i - 1); close = close_blocks(blocknr, other, blocksize); } - if (!close && i < end_slot - 2) { + if (!close && i < end_slot) { other = btrfs_node_blockptr(parent, i + 1); close = close_blocks(blocknr, other, blocksize); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84c3b00f3de8..f9c89cae39ee 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3387,6 +3387,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); @@ -3909,6 +3911,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, loff_t actual_len, u64 *alloc_hint); int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); +#endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f79f38542a73..639f2663ed3f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3921,7 +3921,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)) { - printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n", + printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n", 
btrfs_super_sys_array_size(sb), sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 571f402d3fc4..8b353ad02f03 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3208,6 +3208,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, return 0; } + if (trans->aborted) + return 0; again: inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -3243,6 +3245,20 @@ again: */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. 
+ */ + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } WARN_ON(ret); if (i_size_read(inode) > 0) { @@ -3309,6 +3325,32 @@ out: return ret; } +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_path *path; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(root, SPACE_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -5094,7 +5136,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + nr_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + BTRFS_I(inode)->outstanding_extents += nr_extents; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) @@ -5239,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + if (btrfs_test_is_dummy_root(root)) + return; + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c7233ff1d533..d688cfe5d496 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4968,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb) /* Should be safe to release our pages at 
this point */ btrfs_release_extent_buffer_page(eb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { + __free_extent_buffer(eb); + return 1; + } +#endif call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b78bbbac900d..30982bbd31c3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_unlock(&inode->i_mutex); /* - * we want to make sure fsync finds this change - * but we haven't joined a transaction running right now. - * - * Later on, someone is sure to update the inode and get the - * real transid recorded. - * - * We set last_trans now to the fs_info generation + 1, - * this will either be one more than the running transaction - * or the generation used for the next transaction if there isn't - * one running right now. - * * We also have to set last_sub_trans to the current log transid, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0) { err = generic_write_sync(file, pos, num_written); @@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * check the transaction that last modified this inode - * and see if its already been committed - */ - if (!BTRFS_I(inode)->last_trans) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - /* - * if the last transaction that changed this file was before - * the current transaction, we can bail out now without any - * syncing + * If the last transaction that changed this file was before the current + * transaction and we have the full sync flag set in our inode, we can + * bail out now without any syncing. 
+ * + * Note that we can't bail out if the full sync flag isn't set. This is + * because when the full sync flag is set we start all ordered extents + * and wait for them to fully complete - when they complete they update + * the inode's last_trans field through: + * + * btrfs_finish_ordered_io() -> + * btrfs_update_inode_fallback() -> + * btrfs_update_inode() -> + * btrfs_set_inode_last_trans() + * + * So we are sure that last_trans is up to date and can do this check to + * bail out safely. For the fast path, when the full sync flag is not + * set in our inode, we can not do it because we start only our ordered + * extents and don't wait for them to complete (that is when + * btrfs_finish_ordered_io runs), so here at this point its last_trans + * value might be less than or equal to fs_info->last_trans_committed, + * and setting a speculative last_trans for an inode when a buffered + * write is made (such as fs_info->generation + 1 for example) would not + * be reliable since after setting the value and before fsync is called + * any number of transactions can start and commit (transaction kthread + * commits the current transaction periodically), and a transaction + * commit neither starts nor waits for ordered extents to complete.
*/ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed) { - BTRFS_I(inode)->last_trans = 0; - + (full_sync && BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed)) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2275,6 +2275,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; + bool truncated_page = false; + bool updated_inode = false; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2306,13 +2308,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < ino_size) + if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, len, 0); + } else { + ret = 0; + } goto out_only_mutex; } /* zero back part of the first page */ if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2348,6 +2355,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (!ret) { /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, tail_start + tail_len, 0, 1); if (ret) @@ -2357,8 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (lockend < lockstart) { - mutex_unlock(&inode->i_mutex); - return 0; + ret = 0; + goto out_only_mutex; } while (1) { @@ -2506,6 +2514,7 @@ out_trans: trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); + updated_inode = true; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); out_free: @@ -2515,6 +2524,22 @@ out: 
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); out_only_mutex: + if (!updated_inode && truncated_page && !ret && !err) { + /* + * If we only end up zeroing part of a page, we still need to + * update the inode item, so that all the time fields are + * updated as well as the necessary btrfs inode in memory fields + * for detecting, at fsync time, if the inode isn't yet in the + * log tree or it's there but not up to date. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + } else { + err = btrfs_update_inode(trans, root, inode); + ret = btrfs_end_transaction(trans, root); + } + } mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a85c23dfcddb..d2e732d7af52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, static int btrfs_dirty_inode(struct inode *inode); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode) +{ + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; +} +#endif + static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -1542,30 +1549,17 @@ static void btrfs_split_extent_hook(struct inode *inode, u64 new_size; /* - * We need the largest size of the remaining extent to see if we - * need to add a new outstanding extent. Think of the following - * case - * - * [MEAX_EXTENT_SIZEx2 - 4k][4k] - * - * The new_size would just be 4k and we'd think we had enough - * outstanding extents for this if we only took one side of the - * split, same goes for the other direction. We need to see if - * the larger size still is the same amount of extents as the - * original size, because if it is we need to add a new - * outstanding extent. 
But if we split up and the larger size - * is less than the original then we are good to go since we've - * already accounted for the extra extent in our original - * accounting. + * See the explanation in btrfs_merge_extent_hook, the same + * applies here, just in reverse. */ new_size = orig->end - split + 1; - if ((split - orig->start) > new_size) - new_size = split - orig->start; - - num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); - if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) < num_extents) + new_size = split - orig->start; + num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) >= num_extents) return; } @@ -1591,8 +1585,10 @@ static void btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return; - old_size = other->end - other->start + 1; - new_size = old_size + (new->end - new->start + 1); + if (new->start > other->start) + new_size = new->end - other->start + 1; + else + new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= BTRFS_MAX_EXTENT_SIZE) { @@ -1603,13 +1599,32 @@ static void btrfs_merge_extent_hook(struct inode *inode, } /* - * If we grew by another max_extent, just return, we want to keep that - * reserved amount. + * We have to add up either side to figure out how many extents were + * accounted for before we merged into one big extent. If the number of + * extents we accounted for is <= the amount we need for the new range + * then we can return, otherwise drop. Think of it like this + * + * [ 4k][MAX_SIZE] + * + * So we've grown the extent by a MAX_SIZE extent, this would mean we + * need 2 outstanding extents, on one side we have 1 and the other side + * we have 1 so they are == and we can return. 
But in this case + * + * [MAX_SIZE+4k][MAX_SIZE+4k] + * + * Each range on their own accounts for 2 extents, but merged together + * they are only 3 extents worth of accounting, so we need to drop in + * this case. */ + old_size = other->end - other->start + 1; num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); + old_size = new->end - new->start + 1; + num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) > num_extents) + BTRFS_MAX_EXTENT_SIZE) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -1686,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } + /* For sanity tests */ + if (btrfs_test_is_dummy_root(root)) + return; + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, root->fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); @@ -1741,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode, root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); + /* For sanity tests. */ + if (btrfs_test_is_dummy_root(root)) + return; + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); @@ -7213,7 +7236,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - u64 orig_len = len; + u64 *outstanding_extents = NULL; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -7225,6 +7248,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, lockstart = start; lockend = start + len - 1; + if (current->journal_info) { + /* + * Need to pull our outstanding extents and set journal_info to NULL so + * that anything that needs to check if there's a transction doesn't get + * confused. 
+ */ + outstanding_extents = current->journal_info; + current->journal_info = NULL; + } + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. @@ -7285,7 +7318,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { int type; - int ret; u64 block_start, orig_start, orig_block_len, ram_bytes; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) @@ -7349,11 +7381,20 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - if (len < orig_len) { + /* + * If we have an outstanding_extents count still set then we're + * within our reservation, otherwise we need to adjust our inode + * counter appropriately. + */ + if (*outstanding_extents) { + (*outstanding_extents)--; + } else { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } + + current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); } @@ -7377,6 +7418,8 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); + if (outstanding_extents) + current->journal_info = outstanding_extents; return ret; } @@ -8076,6 +8119,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + u64 outstanding_extents = 0; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8113,6 +8157,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; + outstanding_extents = div64_u64(count + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. 
Abuse current->journal_info for this. + */ + current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_done(inode); @@ -8125,6 +8179,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { + current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); else if (ret >= 0 && (size_t)ret < count) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 534544e08f76..157cc54fc634 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -452,9 +452,7 @@ void btrfs_get_logged_extents(struct inode *inode, continue; if (entry_end(ordered) <= start) break; - if (!list_empty(&ordered->log_list)) - continue; - if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) continue; list_add(&ordered->log_list, logged_list); atomic_inc(&ordered->refs); @@ -511,8 +509,7 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) - list_add_tail(&ordered->trans_list, &trans->ordered); + list_add_tail(&ordered->trans_list, &trans->ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 97159a8e91d4..058c79eecbfb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1, if (oper1->seq < oper2->seq) return -1; if (oper1->seq > oper2->seq) - return -1; + return 1; if (oper1->ref_root < oper2->ref_root) return -1; if (oper1->ref_root > oper2->ref_root) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fe5857223515..d6033f540cc7 100644 --- 
a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -230,6 +230,7 @@ struct pending_dir_move { u64 parent_ino; u64 ino; u64 gen; + bool is_orphan; struct list_head update_refs; }; @@ -2984,7 +2985,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 ino_gen, u64 parent_ino, struct list_head *new_refs, - struct list_head *deleted_refs) + struct list_head *deleted_refs, + const bool is_orphan) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2999,6 +3001,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, pm->parent_ino = parent_ino; pm->ino = ino; pm->gen = ino_gen; + pm->is_orphan = is_orphan; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -3131,16 +3134,20 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) rmdir_ino = dm->rmdir_ino; free_waiting_dir_move(sctx, dm); - ret = get_first_ref(sctx->parent_root, pm->ino, - &parent_ino, &parent_gen, name); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, parent_ino, parent_gen, - from_path); - if (ret < 0) - goto out; - ret = fs_path_add_path(from_path, name); + if (pm->is_orphan) { + ret = gen_unique_name(sctx, pm->ino, + pm->gen, from_path); + } else { + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + } if (ret < 0) goto out; @@ -3150,7 +3157,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) LIST_HEAD(deleted_refs); ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, - &pm->update_refs, &deleted_refs); + &pm->update_refs, &deleted_refs, + pm->is_orphan); if (ret < 0) goto out; if (rmdir_ino) { @@ -3283,6 +3291,127 @@ out: return ret; } +/* + * We might need to delay a directory rename even when no ancestor 
directory + * (in the send root) with a higher inode number than ours (sctx->cur_ino) was + * renamed. This happens when we rename a directory to the old name (the name + * in the parent root) of some other unrelated directory that got its rename + * delayed due to some ancestor with higher number that got renamed. + * + * Example: + * + * Parent snapshot: + * . (ino 256) + * |---- a/ (ino 257) + * | |---- file (ino 260) + * | + * |---- b/ (ino 258) + * |---- c/ (ino 259) + * + * Send snapshot: + * . (ino 256) + * |---- a/ (ino 258) + * |---- x/ (ino 259) + * |---- y/ (ino 257) + * |----- file (ino 260) + * + * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 + * from 'a' to 'x/y' happening first, which in turn depends on the rename of + * inode 259 from 'c' to 'x'. So the order of rename commands the send stream + * must issue is: + * + * 1 - rename 259 from 'c' to 'x' + * 2 - rename 257 from 'a' to 'x/y' + * 3 - rename 258 from 'b' to 'a' + * + * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can + * be done right away and < 0 on error. 
+ */ +static int wait_for_dest_dir_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key di_key; + struct btrfs_dir_item *di; + u64 left_gen; + u64 right_gen; + int ret = 0; + + if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = parent_ref->dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); + + ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + + di = btrfs_match_dir_item_name(sctx->parent_root, path, + parent_ref->name, parent_ref->name_len); + if (!di) { + ret = 0; + goto out; + } + /* + * di_key.objectid has the number of the inode that has a dentry in the + * parent directory with the same name that sctx->cur_ino is being + * renamed to. We need to check if that inode is in the send root as + * well and if it is currently marked as an inode with a pending rename, + * if it is, we need to delay the rename of sctx->cur_ino as well, so + * that it happens after that other inode is renamed. 
+ */ + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); + if (di_key.type != BTRFS_INODE_ITEM_KEY) { + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, + &left_gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, + &right_gen, NULL, NULL, NULL, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + /* Different inode, no need to delay the rename of sctx->cur_ino */ + if (right_gen != left_gen) { + ret = 0; + goto out; + } + + if (is_waiting_for_move(sctx, di_key.objectid)) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + di_key.objectid, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } +out: + btrfs_free_path(path); + return ret; +} + static int wait_for_parent_move(struct send_ctx *sctx, struct recorded_ref *parent_ref) { @@ -3349,7 +3478,8 @@ out: sctx->cur_inode_gen, ino, &sctx->new_refs, - &sctx->deleted_refs); + &sctx->deleted_refs, + false); if (!ret) ret = 1; } @@ -3372,6 +3502,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int did_overwrite = 0; int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool can_rename = true; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -3490,12 +3621,22 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { + ret = wait_for_dest_dir_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move * it depending on the inode mode. 
*/ - if (is_orphan) { + if (is_orphan && can_rename) { ret = send_rename(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -3503,7 +3644,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = fs_path_copy(valid_path, cur->full_path); if (ret < 0) goto out; - } else { + } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* * Dirs can't be linked, so move it. For moved diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index a116b55ce788..054fc0d97131 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -911,6 +911,197 @@ out: return ret; } +static int test_extent_accounting(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + BTRFS_I(inode)->root = root; + btrfs_test_inode_set_ops(inode); + + /* [BTRFS_MAX_EXTENT_SIZE] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 1) { + ret = -EINVAL; + test_msg("Miscount, wanted 1, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, + BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + 
BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + EXTENT_DELALLOC | EXTENT_DIRTY | + EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * + * I'm artificially adding 2 to outstanding_extents because in the + * buffered IO case we'd add things up as we go, but I don't feel like + * doing that here, this isn't the interesting case we want to test. 
+ */ + BTRFS_I(inode)->outstanding_extents += 2; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, + (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * Refill the hole again just for good measure, because I thought it + * might fail and I'd rather satisfy my paranoia at this point. 
+ */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* Empty */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents) { + ret = -EINVAL; + test_msg("Miscount, wanted 0, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + ret = 0; +out: + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + int btrfs_test_inodes(void) { int ret; @@ -924,5 +1115,9 @@ int btrfs_test_inodes(void) if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - return test_hole_first(); + ret = test_hole_first(); + if (ret) + return ret; + test_msg("Running outstanding_extents tests\n"); + return test_extent_accounting(); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7e80f32550a6..8be4278e25e8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1023,17 +1023,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; - bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID); old_root_used = btrfs_root_used(&root->root_item); - btrfs_write_dirty_block_groups(trans, root); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if 
(old_root_bytenr == root->node->start && - old_root_used == btrfs_root_used(&root->root_item) && - (!extent_root || - list_empty(&trans->transaction->dirty_bgs))) + old_root_used == btrfs_root_used(&root->root_item)) break; btrfs_set_root_node(&root->root_item, root->node); @@ -1044,17 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; old_root_used = btrfs_root_used(&root->root_item); - if (extent_root) { - ret = btrfs_write_dirty_block_groups(trans, root); - if (ret) - return ret; - } - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; } return 0; @@ -1071,6 +1056,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *next; struct extent_buffer *eb; int ret; @@ -1098,11 +1084,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_setup_space_cache(trans, root); + if (ret) + return ret; + /* run_qgroups might have added some more refs */ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; - +again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); @@ -1115,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = update_cowonly_root(trans, root); if (ret) return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; } + while (!list_empty(dirty_bgs)) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + } + + if (!list_empty(&fs_info->dirty_cowonly_roots)) + goto again; + 
list_add_tail(&fs_info->extent_root->dirty_list, &trans->transaction->switch_commits); btrfs_after_dev_replace_commit(fs_info); @@ -1814,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_for_commit(root, cur_trans); + if (unlikely(cur_trans->aborted)) + ret = cur_trans->aborted; + btrfs_put_transaction(cur_trans); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9a37f8b39bae..c5b8ba37f88e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1012,7 +1012,7 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)base + cur_offset; + extref = (struct btrfs_inode_extref *)(base + cur_offset); victim_name_len = btrfs_inode_extref_name_len(leaf, extref); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd4d1315aaa9..8222f6f74147 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4903,10 +4903,17 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) { struct btrfs_bio *bbio = kzalloc( + /* the size of the btrfs_bio */ sizeof(struct btrfs_bio) + + /* plus the variable array for the stripes */ sizeof(struct btrfs_bio_stripe) * (total_stripes) + + /* plus the variable array for the tgt dev */ sizeof(int) * (real_stripes) + - sizeof(u64) * (real_stripes), + /* + * plus the raid_map, which includes both the tgt dev + * and the stripes + */ + sizeof(u64) * (total_stripes), GFP_NOFS); if (!bbio) return NULL; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 47b19465f0dc..883b93623bc5 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -111,6 +111,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans, name, name_len, -1); if (!di && (flags & XATTR_REPLACE)) ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); else if (di) ret = btrfs_delete_one_dir_name(trans, root, path, di); goto out; @@ -127,10 
+129,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans, ASSERT(mutex_is_locked(&inode->i_mutex)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); - if (!di) { + if (!di) ret = -ENODATA; + else if (IS_ERR(di)) + ret = PTR_ERR(di); + if (ret) goto out; - } btrfs_release_path(path); di = NULL; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 4ac7445e6ec7..aa0dc2573374 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -1,6 +1,9 @@ /* * fs/cifs/cifsencrypt.c * + * Encryption and hashing operations relating to NTLM, NTLMv2. See MS-NLMP + * for more detailed information + * * Copyright (C) International Business Machines Corp., 2005,2013 * Author(s): Steve French (sfrench@us.ibm.com) * @@ -515,7 +518,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, __func__); return rc; } - } else if (ses->serverName) { + } else { + /* We use ses->serverName if no domain name available */ len = strlen(ses->serverName); server = kmalloc(2 + (len * 2), GFP_KERNEL); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d3aa999ab785..480cf9c81d50 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1599,6 +1599,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, pr_warn("CIFS: username too long\n"); goto cifs_parse_mount_err; } + + kfree(vol->username); vol->username = kstrdup(string, GFP_KERNEL); if (!vol->username) goto cifs_parse_mount_err; @@ -1700,6 +1702,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } + kfree(vol->domainname); vol->domainname = kstrdup(string, GFP_KERNEL); if (!vol->domainname) { pr_warn("CIFS: no memory for domainname\n"); @@ -1731,6 +1734,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } if (strncasecmp(string, "default", 7) != 0) { + kfree(vol->iocharset); vol->iocharset = kstrdup(string, GFP_KERNEL); if (!vol->iocharset) { @@ -2913,8 +2917,7 @@ 
ip_rfc1001_connect(struct TCP_Server_Info *server) * calling name ends in null (byte 16) from old smb * convention. */ - if (server->workstation_RFC1001_name && - server->workstation_RFC1001_name[0] != 0) + if (server->workstation_RFC1001_name[0] != 0) rfc1002mangle(ses_init_buf->trailer. session_req.calling_name, server->workstation_RFC1001_name, @@ -3692,6 +3695,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, #endif /* CIFS_WEAK_PW_HASH */ rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr, nls_codepage); + if (rc) { + cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n", + __func__, rc); + cifs_buf_release(smb_buffer); + return rc; + } bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a94b3e673182..ca30c391a894 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1823,6 +1823,7 @@ refind_writable: cifsFileInfo_put(inv_file); spin_lock(&cifs_file_list_lock); ++refind; + inv_file = NULL; goto refind_writable; } } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2d4f37235ed0..3e126d7bb2ea 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -771,6 +771,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_buf_release(srchinf->ntwrk_buf_start); } kfree(srchinf); + if (rc) + goto cgii_exit; } else goto cgii_exit; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 689f035915cf..22dfdf17d065 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -322,7 +322,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) /* return pointer to beginning of data area, ie offset from SMB start */ if ((*off != 0) && (*len != 0)) - return hdr->ProtocolId + *off; + return (char *)(&hdr->ProtocolId[0]) + *off; else return NULL; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 96b5d40a2ece..eab05e1aa587 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -684,7 +684,8 @@ smb2_clone_range(const unsigned int xid, /* No 
need to change MaxChunks since already set to 1 */ chunk_sizes_updated = true; - } + } else + goto cchunk_out; } cchunk_out: diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 3417340bf89e..65cd7a84c8bc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1218,7 +1218,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp; struct TCP_Server_Info *server; - struct cifs_ses *ses = tcon->ses; + struct cifs_ses *ses; struct kvec iov[2]; int resp_buftype; int num_iovecs; @@ -1233,6 +1233,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (plen) *plen = 0; + if (tcon) + ses = tcon->ses; + else + return -EIO; + if (ses && (ses->server)) server = ses->server; else @@ -1296,14 +1301,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; if ((rc != 0) && (rc != -EINVAL)) { - if (tcon) - cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); goto ioctl_exit; } else if (rc == -EINVAL) { if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) && (opcode != FSCTL_SRV_COPYCHUNK)) { - if (tcon) - cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); goto ioctl_exit; } } @@ -1629,7 +1632,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); - if ((rc != 0) && tcon) + if (rc != 0) cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); free_rsp_buf(resp_buftype, iov[0].iov_base); @@ -2114,7 +2117,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, struct kvec iov[2]; int rc = 0; int len; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; unsigned char *bufptr; struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 90d1882b306f..5ba029e627cc 
100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -124,7 +124,7 @@ ecryptfs_get_key_payload_data(struct key *key) } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 -#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 +#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 #define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ #define ECRYPTFS_SALT_BYTES 2 @@ -237,7 +237,7 @@ struct ecryptfs_crypt_stat { struct crypto_ablkcipher *tfm; struct crypto_hash *hash_tfm; /* Crypto context for generating * the initialization vectors */ - unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; struct list_head keysig_list; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index b07731e68c0b..fd39bad6f1bd 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -303,9 +303,22 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOTTY; - if (lower_file->f_op->unlocked_ioctl) + if (!lower_file->f_op->unlocked_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + case FS_IOC_GETVERSION: + case FS_IOC_SETVERSION: rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); - return rc; + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } } #ifdef CONFIG_COMPAT @@ -315,9 +328,22 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOIOCTLCMD; - if (lower_file->f_op->compat_ioctl) + if (!lower_file->f_op->compat_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC32_GETFLAGS: + case FS_IOC32_SETFLAGS: + case FS_IOC32_GETVERSION: + case FS_IOC32_SETVERSION: rc = 
lower_file->f_op->compat_ioctl(lower_file, cmd, arg); - return rc; + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } } #endif diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 917bd5c9776a..6bd67e2011f0 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -891,7 +891,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { struct blkcipher_desc desc; char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; char iv[ECRYPTFS_MAX_IV_BYTES]; - char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; }; /** diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1895d60f4122..c095d3264259 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -407,7 +407,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, if (!cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); - BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e907052eeadb..32a8bbd7a9ad 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,6 +53,18 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * If an inode is constantly having its pages dirtied, but then the + * updates stop dirtytime_expire_interval seconds in the past, it's + * possible for the worst case time between when an inode has its + * timestamps updated and when they finally get written out to be two + * dirtytime_expire_intervals. We set the default to 12 hours (in + * seconds), which means most of the time inodes will have their + * timestamps written to disk after 12 hours, but in the worst case a + * few inodes might not their timestamps updated for 24 hours. 
+ */ +unsigned int dirtytime_expire_interval = 12 * 60 * 60; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -275,8 +287,8 @@ static int move_expired_inodes(struct list_head *delaying_queue, if ((flags & EXPIRE_DIRTY_ATIME) == 0) older_than_this = work->older_than_this; - else if ((work->reason == WB_REASON_SYNC) == 0) { - expire_time = jiffies - (HZ * 86400); + else if (!work->for_sync) { + expire_time = jiffies - (dirtytime_expire_interval * HZ); older_than_this = &expire_time; } while (!list_empty(delaying_queue)) { @@ -458,6 +470,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, */ redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ @@ -505,12 +518,17 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) && - (inode->i_state & I_DIRTY_TIME)) || - (inode->i_state & I_DIRTY_TIME_EXPIRED)) { - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; - trace_writeback_lazytime(inode); - } + if (inode->i_state & I_DIRTY_TIME) { + if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || + unlikely(time_after(jiffies, + (inode->dirtied_time_when + + dirtytime_expire_interval * HZ)))) { + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; + trace_writeback_lazytime(inode); + } + } else + inode->i_state &= ~I_DIRTY_TIME_EXPIRED; inode->i_state &= ~dirty; /* @@ -1131,6 +1149,56 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) rcu_read_unlock(); } +/* + * Wake up bdi's periodically to make sure dirtytime inodes gets + * written back periodically. 
We deliberately do *not* check the + * b_dirtytime list in wb_has_dirty_io(), since this would cause the + * kernel to be constantly waking up once there are any dirtytime + * inodes on the system. So instead we define a separate delayed work + * function which gets called much more rarely. (By default, only + * once every 12 hours.) + * + * If there is any other write activity going on in the file system, + * this function won't be necessary. But if the only thing that has + * happened on the file system is a dirtytime inode caused by an atime + * update, we need this infrastructure below to make sure that inode + * eventually gets pushed out to disk. + */ +static void wakeup_dirtytime_writeback(struct work_struct *w); +static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); + +static void wakeup_dirtytime_writeback(struct work_struct *w) +{ + struct backing_dev_info *bdi; + + rcu_read_lock(); + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + if (list_empty(&bdi->wb.b_dirty_time)) + continue; + bdi_wakeup_thread(bdi); + } + rcu_read_unlock(); + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); +} + +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + return 0; +} +__initcall(start_dirtytime_writeback); + +int dirtytime_interval_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + mod_delayed_work(system_wq, &dirtytime_work, 0); + return ret; +} + static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -1269,8 +1337,13 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, dirtytime ? 
- &bdi->wb.b_dirty_time : &bdi->wb.b_dirty); + if (dirtytime) + inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + else + list_move(&inode->i_wb_list, + &bdi->wb.b_dirty_time); spin_unlock(&bdi->wb.list_lock); trace_writeback_dirty_inode_enqueue(inode); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ed19a7d622fa..39706c57ad3c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -890,8 +890,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newpage = buf->page; - if (WARN_ON(!PageUptodate(newpage))) - return -EIO; + if (!PageUptodate(newpage)) + SetPageUptodate(newpage); ClearPageMappedToDisk(newpage); @@ -1353,6 +1353,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, return err; } +static int fuse_dev_open(struct inode *inode, struct file *file) +{ + /* + * The fuse device's file's private_data is used to hold + * the fuse_conn(ection) when it is mounted, and is used to + * keep track of whether the file has been mounted already. 
+ */ + file->private_data = NULL; + return 0; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -1797,6 +1808,9 @@ copy_finish: static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { + /* Don't try to move pages (yet) */ + cs->move_pages = 0; + switch (code) { case FUSE_NOTIFY_POLL: return fuse_notify_poll(fc, size, cs); @@ -2217,6 +2231,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, + .open = fuse_dev_open, .llseek = no_llseek, .read = do_sync_read, .aio_read = fuse_dev_read, diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 6e560d56094b..754fdf8c6356 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -168,9 +171,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -370,6 +370,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd, hfs_find_rec_by_key); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b684e8a132e6..2bacb9988566 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -207,6 +207,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, goto out_free; 
} + of->event = atomic_read(&of->kn->attr.open->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, *ppos); diff --git a/fs/locks.c b/fs/locks.c index 365c82e1b3a9..40bc384728c0 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1388,9 +1388,8 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker) int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; - struct file_lock *new_fl; struct file_lock_context *ctx = inode->i_flctx; - struct file_lock *fl; + struct file_lock *new_fl, *fl, *tmp; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; LIST_HEAD(dispose); @@ -1420,7 +1419,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) break_time++; /* so that 0 means no break time */ } - list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { if (!leases_conflict(fl, new_fl)) continue; if (want_write) { @@ -1665,7 +1664,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr } if (my_fl != NULL) { - error = lease->fl_lmops->lm_change(my_fl, arg, &dispose); + lease = my_fl; + error = lease->fl_lmops->lm_change(lease, arg, &dispose); if (error) goto out; goto out_setup; @@ -1727,7 +1727,7 @@ static int generic_delete_lease(struct file *filp, void *owner) break; } } - trace_generic_delete_lease(inode, fl); + trace_generic_delete_lease(inode, victim); if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f9f4845db989..19874151e95c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -433,7 +433,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat static bool nfs_client_init_is_complete(const struct nfs_client *clp) { - return clp->cl_cons_state != NFS_CS_INITING; + return clp->cl_cons_state <= NFS_CS_READY; } int 
nfs_wait_client_init_complete(const struct nfs_client *clp) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index a1f0685b42ff..a6ad68865880 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -181,8 +181,8 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); spin_unlock(&delegation->lock); - put_rpccred(oldcred); rcu_read_unlock(); + put_rpccred(oldcred); trace_nfs4_reclaim_delegation(inode, res->delegation_type); } else { /* We appear to have raced with a delegation return. */ @@ -370,7 +370,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, + if (test_and_set_bit(NFS_DELEGATION_RETURNING, + &old_delegation->flags)) + goto out; + freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp); if (freeme == NULL) goto out; @@ -433,6 +436,8 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + goto out; if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { @@ -444,6 +449,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) ret = true; spin_unlock(&delegation->lock); } +out: return ret; } @@ -471,14 +477,20 @@ restart: super_list) { if (!nfs_delegation_need_return(delegation)) continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) + if (!nfs_sb_active(server->super)) continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); + 
nfs_sb_deactive(server->super); if (!err) goto restart; set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); @@ -809,19 +821,30 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) + if (!nfs_sb_active(server->super)) continue; - delegation = nfs_detach_delegation(NFS_I(inode), - delegation, server); + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); - - if (delegation != NULL) - nfs_free_delegation(delegation); + if (delegation != NULL) { + delegation = nfs_detach_delegation(NFS_I(inode), + delegation, server); + if (delegation != NULL) + nfs_free_delegation(delegation); + } iput(inode); + nfs_sb_deactive(server->super); goto restart; } } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9b0c55cb2a2e..c19e16f0b2d0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -408,14 +408,22 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc, return 0; } +/* Match file and dirent using either filehandle or fileid + * Note: caller is responsible for checking the fsid + */ static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { + struct nfs_inode *nfsi; + if (dentry->d_inode == NULL) goto different; - if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) - goto different; - return 1; + + nfsi = NFS_I(dentry->d_inode); + if (entry->fattr->fileid == nfsi->fileid) + return 1; + if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) + return 1; different: return 0; } @@ -469,6 +477,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct inode *inode; 
int status; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID)) + return; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) + return; if (filename.name[0] == '.') { if (filename.len == 1) return; @@ -479,6 +491,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) dentry = d_lookup(parent, &filename); if (dentry != NULL) { + /* Is there a mountpoint here? If so, just exit */ + if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, + &entry->fattr->fsid)) + goto out; if (nfs_same_file(dentry, entry)) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); status = nfs_refresh_inode(dentry->d_inode, entry->fattr); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 94712fc781fa..e679d24c39d3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -178,7 +178,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); + result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) @@ -199,7 +199,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping(inode, filp->f_mapping); + res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) @@ -372,6 +372,10 @@ start: nfs_wait_bit_killable, TASK_KILLABLE); if (ret) return ret; + /* + * Wait for O_DIRECT to complete + */ + nfs_inode_dio_wait(mapping->host); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -619,6 +623,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); + 
wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + lock_page(page); mapping = page_file_mapping(page); if (mapping != inode->i_mapping) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 83107be3dd01..d42dff6d5e98 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -556,6 +556,7 @@ EXPORT_SYMBOL_GPL(nfs_setattr); * This is a copy of the common vmtruncate, but with the locking * corrected to take into account the fact that NFS requires * inode->i_size to be updated under the inode->i_lock. + * Note: must be called with inode->i_lock held! */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { @@ -565,14 +566,14 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) if (err) goto out; - spin_lock(&inode->i_lock); i_size_write(inode, offset); /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); + spin_lock(&inode->i_lock); out: return err; } @@ -585,10 +586,15 @@ out: * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. */ -void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, + struct nfs_fattr *fattr) { + /* Barrier: bump the attribute generation count. 
*/ + nfs_fattr_set_barrier(fattr); + + spin_lock(&inode->i_lock); + NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { - spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -600,12 +606,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_gid = attr->ia_gid; nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); - spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } + nfs_update_inode(inode, fattr); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); @@ -1028,6 +1035,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { + unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; @@ -1060,11 +1068,14 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) } /** - * nfs_revalidate_mapping - Revalidate the pagecache + * __nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping + * @may_lock - take inode->i_mutex? 
*/ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +static int __nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping, + bool may_lock) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1113,7 +1124,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); + if (may_lock) { + mutex_lock(&inode->i_mutex); + ret = nfs_invalidate_mapping(inode, mapping); + mutex_unlock(&inode->i_mutex); + } else + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1123,6 +1139,29 @@ out: return ret; } +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, false); +} + +/** + * nfs_revalidate_mapping_protected - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + * + * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex + * while invalidating the mapping. 
+ */ +int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, true); +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1231,13 +1270,6 @@ static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fat return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } -static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) -{ - if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) - return 0; - return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); -} - static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) @@ -1249,6 +1281,7 @@ unsigned long nfs_inc_attr_generation_counter(void) { return atomic_long_inc_return(&nfs_attr_generation_counter); } +EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter); void nfs_fattr_init(struct nfs_fattr *fattr) { @@ -1260,6 +1293,22 @@ void nfs_fattr_init(struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_fattr_init); +/** + * nfs_fattr_set_barrier + * @fattr: attributes + * + * Used to set a barrier after an attribute was updated. This + * barrier ensures that older attributes from RPC calls that may + * have raced with our update cannot clobber these new values. + * Note that you are still responsible for ensuring that other + * operations which change the attribute on the server do not + * collide. 
+ */ +void nfs_fattr_set_barrier(struct nfs_fattr *fattr) +{ + fattr->gencount = nfs_inc_attr_generation_counter(); +} + struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; @@ -1370,7 +1419,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || nfs_ctime_need_update(inode, fattr) || - nfs_size_need_update(inode, fattr) || ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } @@ -1460,6 +1508,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) int status; spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); @@ -1468,7 +1517,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** - * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache * @inode - pointer to inode * @fattr - updated attributes * @@ -1478,11 +1527,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); * * This function is mainly designed to be used by the ->write_done() functions. 
*/ -int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { int status; - spin_lock(&inode->i_lock); /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { @@ -1514,6 +1562,27 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. + */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); + status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } @@ -1715,6 +1784,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; + /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { @@ -1722,6 +1792,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } + /* Set the barrier to be more recent than this fattr */ + if ((long)fattr->gencount 
- (long)nfsi->attr_gencount > 0) + nfsi->attr_gencount = fattr->gencount; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b802fb3a2d99..9e6475bc5ba2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -459,6 +459,7 @@ void nfs_mark_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 78e557c3ab87..1f11d2533ee4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -138,7 +138,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } @@ -834,7 +834,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 2a932fdc57cb..53852a4bd88b 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1987,6 +1987,11 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + if (entry->fattr->fileid != entry->ino) { + entry->fattr->mounted_on_fileid = entry->ino; + entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID; + } + /* In fact, a post_op_fh3: */ p = 
xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8646af9b11d2..86d6214ea022 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -621,6 +621,9 @@ int nfs41_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + if (pos->rpc_ops != new->rpc_ops) continue; @@ -639,10 +642,6 @@ int nfs41_walk_client_list(struct nfs_client *new, prev = pos; status = nfs_wait_client_init_complete(pos); - if (pos->cl_cons_state == NFS_CS_SESSION_INITING) { - nfs4_schedule_lease_recovery(pos); - status = nfs4_wait_clnt_recover(pos); - } spin_lock(&nn->nfs_client_lock); if (status < 0) break; @@ -668,7 +667,7 @@ int nfs41_walk_client_list(struct nfs_client *new, */ if (!nfs4_match_client_owner_id(pos, new)) continue; - +found: atomic_inc(&pos->cl_count); *result = pos; status = 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 88180ac5ea0e..627f37c44456 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -901,6 +901,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); dir->i_version = cinfo->after; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); } @@ -1552,6 +1553,9 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; + opendata->o_arg.share_access = nfs4_map_atomic_open_share( + NFS_SB(opendata->dentry->d_sb), + fmode, 0); memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); @@ -2413,8 +2417,8 @@ static int _nfs4_do_open(struct inode *dir, opendata->o_res.f_attr, sattr, state, label, olabel); if (status == 0) { - 
nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } @@ -2651,7 +2655,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_BAD_STATEID: case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, - &state->stateid)) { + &state->open_stateid)) { rpc_restart_call_prepare(task); goto out_release; } @@ -2687,7 +2691,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid); + nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { @@ -3288,7 +3292,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); if (status == 0) { - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); nfs_setsecurity(inode, fattr, label); } nfs4_label_free(label); @@ -4234,7 +4238,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); - nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr); + nfs_writeback_update_inode(hdr); } return 0; } @@ -6893,9 +6897,13 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (status == 0) { clp->cl_clientid = res.clientid; - clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); - if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) + clp->cl_exchange_flags = res.flags; + /* Client ID is not confirmed */ + if (!(res.flags & 
EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); clp->cl_seqid = res.seqid; + } kfree(clp->cl_serverowner); clp->cl_serverowner = res.server_owner; @@ -7227,6 +7235,9 @@ static void nfs4_update_session(struct nfs4_session *session, struct nfs41_create_session_res *res) { nfs4_copy_sessionid(&session->sess_id, &res->sessionid); + /* Mark client id and session as being confirmed */ + session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state); session->flags = res->flags; memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs)); if (res->flags & SESSION4_BACK_CHAN) @@ -7322,8 +7333,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, dprintk("--> nfs4_proc_destroy_session\n"); /* session is still being setup */ - if (session->clp->cl_cons_state != NFS_CS_READY) - return status; + if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state)) + return 0; status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_destroy_session(session->clp, status); diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index fc46c7455898..e3ea2c5324d6 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -70,6 +70,7 @@ struct nfs4_session { enum nfs4_session_state { NFS4_SESSION_INITING, + NFS4_SESSION_ESTABLISHED, }; extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5ad908e9ce9c..f95e3b58bbc3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -346,9 +346,23 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, status = nfs4_proc_exchange_id(clp, cred); if (status != NFS4_OK) return status; - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - return nfs41_walk_client_list(clp, result, cred); + status = nfs41_walk_client_list(clp, result, cred); + if (status < 0) + return status; + if (clp != *result) + 
return 0; + + /* Purge state if the client id was established in a prior instance */ + if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R) + set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + else + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_manager(clp); + status = nfs_wait_client_init_complete(clp); + if (status < 0) + nfs_put_client(clp); + return status; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b09cc23d6f43..c63189acd052 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -139,7 +139,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } @@ -609,10 +609,8 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode = hdr->inode; - if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 595d81e354d1..849ed784d6ac 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1377,6 +1377,36 @@ static int nfs_should_remove_suid(const struct inode *inode) return 0; } +static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, + struct nfs_fattr *fattr) +{ + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; + + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return; + if (argp->offset + resp->count != fattr->size) + return; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + return; + /* Set attribute barrier */ + nfs_fattr_set_barrier(fattr); +} + +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) +{ + struct nfs_fattr *fattr = hdr->res.fattr; + struct inode 
*inode = hdr->inode; + + if (fattr == NULL) + return; + spin_lock(&inode->i_lock); + nfs_writeback_check_extend(hdr, fattr); + nfs_post_op_update_inode_force_wcc_locked(inode, fattr); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_writeback_update_inode); + /* * This function is called when the WRITE call is complete. */ diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index cdbc78c72542..03d647bf195d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -137,7 +137,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, seg->offset = iomap.offset; seg->length = iomap.length; - dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es); + dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es); return 0; out_error: diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 9da89fddab33..9aa2796da90d 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -122,19 +122,19 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, p = xdr_decode_hyper(p, &bex.foff); if (bex.foff & (block_size - 1)) { - dprintk("%s: unaligned offset %lld\n", + dprintk("%s: unaligned offset 0x%llx\n", __func__, bex.foff); goto fail; } p = xdr_decode_hyper(p, &bex.len); if (bex.len & (block_size - 1)) { - dprintk("%s: unaligned length %lld\n", + dprintk("%s: unaligned length 0x%llx\n", __func__, bex.foff); goto fail; } p = xdr_decode_hyper(p, &bex.soff); if (bex.soff & (block_size - 1)) { - dprintk("%s: unaligned disk offset %lld\n", + dprintk("%s: unaligned disk offset 0x%llx\n", __func__, bex.soff); goto fail; } diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 3c1bfa155571..6904213a4363 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -118,7 +118,7 @@ void nfsd4_setup_layout_type(struct svc_export *exp) { struct super_block *sb = exp->ex_path.mnt->mnt_sb; - if (exp->ex_flags & NFSEXP_NOPNFS) + if (!(exp->ex_flags & NFSEXP_PNFS)) return; if 
(sb->s_export_op->get_uuid && @@ -440,15 +440,14 @@ nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, list_move_tail(&lp->lo_perstate, reaplist); return; } - end = seg->offset; + lo->offset = layout_end(seg); } else { /* retain the whole layout segment on a split. */ if (layout_end(seg) < end) { dprintk("%s: split not supported\n", __func__); return; } - - lo->offset = layout_end(seg); + end = seg->offset; } layout_update_len(lo, end); @@ -513,6 +512,9 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp, spin_lock(&clp->cl_lock); list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { + if (ls->ls_layout_type != lrp->lr_layout_type) + continue; + if (lrp->lr_return_type == RETURN_FSID && !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, &cstate->current_fh.fh_handle)) @@ -587,7 +589,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); - nfsd4_cb_layout_fail(ls); + trace_layout_recall_fail(&ls->ls_stid.sc_stateid); printk(KERN_WARNING "nfsd: client %s failed to respond to layout recall. 
" diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d30bea8d0277..92b9d97aff4f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1237,8 +1237,8 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); gdp->gd_notify_types &= ops->notify_types; - exp_put(exp); out: + exp_put(exp); return nfserr; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f6b2a09f793f..8ba1d888f1e6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1638,7 +1638,7 @@ __destroy_client(struct nfs4_client *clp) nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_revoked)) { - dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); nfs4_put_stid(&dp->dl_stid); } @@ -3221,7 +3221,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, } else nfs4_free_openowner(&oo->oo_owner); spin_unlock(&clp->cl_lock); - return oo; + return ret; } static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { @@ -5062,7 +5062,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, } else nfs4_free_lockowner(&lo->lo_owner); spin_unlock(&clp->cl_lock); - return lo; + return ret; } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index df5e66caf100..5fb7e78169a6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1562,7 +1562,11 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, p = xdr_decode_hyper(p, &lgp->lg_seg.offset); p = xdr_decode_hyper(p, &lgp->lg_seg.length); p = xdr_decode_hyper(p, &lgp->lg_minlength); - nfsd4_decode_stateid(argp, &lgp->lg_sid); + + status = nfsd4_decode_stateid(argp, &lgp->lg_sid); + if (status) + return status; + READ_BUF(4); lgp->lg_maxcount = be32_to_cpup(p++); @@ -1580,7 +1584,11 @@ nfsd4_decode_layoutcommit(struct 
nfsd4_compoundargs *argp, p = xdr_decode_hyper(p, &lcp->lc_seg.offset); p = xdr_decode_hyper(p, &lcp->lc_seg.length); lcp->lc_reclaim = be32_to_cpup(p++); - nfsd4_decode_stateid(argp, &lcp->lc_sid); + + status = nfsd4_decode_stateid(argp, &lcp->lc_sid); + if (status) + return status; + READ_BUF(4); lcp->lc_newoffset = be32_to_cpup(p++); if (lcp->lc_newoffset) { @@ -1628,7 +1636,11 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, READ_BUF(16); p = xdr_decode_hyper(p, &lrp->lr_seg.offset); p = xdr_decode_hyper(p, &lrp->lr_seg.length); - nfsd4_decode_stateid(argp, &lrp->lr_sid); + + status = nfsd4_decode_stateid(argp, &lrp->lr_sid); + if (status) + return status; + READ_BUF(4); lrp->lrf_body_len = be32_to_cpup(p++); if (lrp->lrf_body_len > 0) { @@ -4123,7 +4135,7 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr_resource; *p++ = cpu_to_be32(lrp->lrs_present); if (lrp->lrs_present) - nfsd4_encode_stateid(xdr, &lrp->lr_sid); + return nfsd4_encode_stateid(xdr, &lrp->lr_sid); return nfs_ok; } #endif /* CONFIG_NFSD_PNFS */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 83a9694ec485..46ec934f5dee 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -165,13 +165,17 @@ int nfsd_reply_cache_init(void) { unsigned int hashsize; unsigned int i; + int status = 0; max_drc_entries = nfsd_cache_size_limit(); atomic_set(&num_drc_entries, 0); hashsize = nfsd_hashsize(max_drc_entries); maskbits = ilog2(hashsize); - register_shrinker(&nfsd_reply_cache_shrinker); + status = register_shrinker(&nfsd_reply_cache_shrinker); + if (status) + return status; + drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), 0, 0, NULL); if (!drc_slab) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b2e3ff347620..ecdbae19a766 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -31,6 +31,8 @@ #include "alloc.h" #include "dat.h" +static void __nilfs_btree_init(struct nilfs_bmap *bmap); + static struct 
nilfs_btree_path *nilfs_btree_alloc_path(void) { struct nilfs_btree_path *path; @@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, return ret; } +/** + * nilfs_btree_root_broken - verify consistency of btree root node + * @node: btree root node to be examined + * @ino: inode number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + */ +static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, + unsigned long ino) +{ + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level > NILFS_BTREE_LEVEL_MAX || + nchildren < 0 || + nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { + pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", + ino, level, flags, nchildren); + ret = 1; + } + return ret; +} + int nilfs_btree_broken_node_block(struct buffer_head *bh) { int ret; @@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, /* convert and insert */ dat = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_bmap_get_dat(btree) : NULL; - nilfs_btree_init(btree); + __nilfs_btree_init(btree); if (nreq != NULL) { nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); @@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_gather_data = NULL, }; -int nilfs_btree_init(struct nilfs_bmap *bmap) +static void __nilfs_btree_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops; bmap->b_nchildren_per_block = NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); - return 0; +} + +int nilfs_btree_init(struct nilfs_bmap *bmap) +{ + int ret = 0; + + __nilfs_btree_init(bmap); + + if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), + bmap->b_inode->i_ino)) + ret = -EIO; + return ret; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 469086b9f99b..0c3f303baf32 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; + int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); @@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, brelse(ii->i_bh); ii->i_bh = NULL; list_del_init(&ii->i_dirty); - if (!ii->vfs_inode.i_nlink) { + if (!ii->vfs_inode.i_nlink || during_mount) { /* - * Defer calling iput() to avoid a deadlock - * over I_SYNC flag for inodes with i_nlink == 0 + * Defer calling iput() to avoid deadlocks if + * i_nlink == 0 or mount is not yet finished. 
*/ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); defer_iput = true; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 9a66ff79ff27..d2f97ecca6a5 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -143,7 +143,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return false; - if (event_mask & marks_mask & ~marks_ignored_mask) + if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask & + ~marks_ignored_mask) return true; return false; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8490c64d34fe..460c6c37e683 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -502,7 +502,7 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) { - if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) return 1; return 0; } diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 20e37a3ed26f..db64ce2d4667 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -102,11 +102,11 @@ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ - | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ + | OCFS2_FEATURE_INCOMPAT_APPEND_DIO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) /* * Heartbeat-only devices are missing journals and other files. The @@ -179,6 +179,11 @@ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 /* + * Append Direct IO support + */ +#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 + +/* * backup superblock flag is used to indicate that this volume * has backup superblocks. 
*/ @@ -200,10 +205,6 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 -/* - * Append Direct IO support - */ -#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008 /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b90952f528b1..5f0d1993e6e3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -529,8 +529,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ufs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && - (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY))) + if (!(*flags & MS_RDONLY) && !ufs->upper_mnt) return -EROFS; return 0; @@ -615,9 +614,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) break; default: + pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; } } + + /* Workdir is useless in non-upper mount */ + if (!config->upperdir && config->workdir) { + pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + return 0; } @@ -837,7 +846,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; if (ufs->config.upperdir) { - /* FIXME: workdir is not needed for a R/O mount */ if (!ufs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); goto out_free_config; @@ -847,6 +855,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_free_config; + /* Upper fs should not be r/o */ + if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out_put_upperpath; + } + err = ovl_mount_dir(ufs->config.workdir, &workpath); if (err) goto out_put_upperpath; @@ -869,8 +884,14 @@ static int 
ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); - if (stacklen > OVL_MAX_STACK) + if (stacklen > OVL_MAX_STACK) { + pr_err("overlayfs: too many lower directries, limit is %d\n", + OVL_MAX_STACK); goto out_free_lowertmp; + } else if (!ufs->config.upperdir && stacklen == 1) { + pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + goto out_free_lowertmp; + } stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); if (!stack) @@ -932,8 +953,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->numlower++; } - /* If the upper fs is r/o or nonexistent, we mark overlayfs r/o too */ - if (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)) + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; sb->s_d_op = &ovl_dentry_operations; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 956b75d61809..6dee68d013ff 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1325,6 +1325,9 @@ out: static int pagemap_open(struct inode *inode, struct file *file) { + /* do not disclose physical addresses: attack vector */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " "to stop being page-shift some time soon. 
See the " "linux/Documentation/vm/pagemap.txt for details.\n"); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ce615d12fb44..a2e1cb8a568b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -397,7 +397,8 @@ STATIC int /* error (positive) */ xfs_zero_last_block( struct xfs_inode *ip, xfs_fsize_t offset, - xfs_fsize_t isize) + xfs_fsize_t isize, + bool *did_zeroing) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); @@ -425,6 +426,7 @@ xfs_zero_last_block( zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; + *did_zeroing = true; return xfs_iozero(ip, isize, zero_len); } @@ -443,7 +445,8 @@ int /* error (positive) */ xfs_zero_eof( struct xfs_inode *ip, xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ + xfs_fsize_t isize, /* current inode size */ + bool *did_zeroing) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_zero_fsb; @@ -465,7 +468,7 @@ xfs_zero_eof( * We only zero a part of that block so it is handled specially. */ if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize); + error = xfs_zero_last_block(ip, offset, isize, did_zeroing); if (error) return error; } @@ -525,6 +528,7 @@ xfs_zero_eof( if (error) return error; + *did_zeroing = true; start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); } @@ -567,13 +571,15 @@ restart: * having to redo all checks before. 
*/ if (*pos > i_size_read(inode)) { + bool zero = false; + if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); goto restart; } - error = xfs_zero_eof(ip, *pos, i_size_read(inode)); + error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero); if (error) return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index daafa1f6d260..6163767aa856 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2867,6 +2867,10 @@ xfs_rename( * Handle RENAME_EXCHANGE flags */ if (flags & RENAME_EXCHANGE) { + if (target_ip == NULL) { + error = -EINVAL; + goto error_return; + } error = xfs_cross_rename(tp, src_dp, src_name, src_ip, target_dp, target_name, target_ip, &free_list, &first_block, spaceres); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 86cd6b39bed7..a1cd55f3f351 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -384,10 +384,11 @@ enum xfs_prealloc_flags { XFS_PREALLOC_INVISIBLE = (1 << 4), }; -int xfs_update_prealloc_flags(struct xfs_inode *, - enum xfs_prealloc_flags); -int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); -int xfs_iozero(struct xfs_inode *, loff_t, size_t); +int xfs_update_prealloc_flags(struct xfs_inode *ip, + enum xfs_prealloc_flags flags); +int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, + xfs_fsize_t isize, bool *did_zeroing); +int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); #define IHOLD(ip) \ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d919ad7b16bf..e53a90331422 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -751,6 +751,7 @@ xfs_setattr_size( int error; uint lock_flags = 0; uint commit_flags = 0; + bool did_zeroing = false; trace_xfs_setattr(ip); @@ -794,20 +795,16 @@ xfs_setattr_size( return error; /* - * Now we can make the changes. Before we join the inode to the - * transaction, take care of the part of the truncation that must be - * done without the inode lock. 
This needs to be done before joining - * the inode to the transaction, because the inode cannot be unlocked - * once it is a part of the transaction. + * File data changes must be complete before we start the transaction to + * modify the inode. This needs to be done before joining the inode to + * the transaction because the inode cannot be unlocked once it is a + * part of the transaction. + * + * Start with zeroing any data block beyond EOF that we may expose on + * file extension. */ if (newsize > oldsize) { - /* - * Do the first part of growing a file: zero any data in the - * last block that is beyond the old EOF. We need to do this - * before the inode is joined to the transaction to modify - * i_size. - */ - error = xfs_zero_eof(ip, newsize, oldsize); + error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); if (error) return error; } @@ -817,23 +814,18 @@ xfs_setattr_size( * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files - * problem. - * - * Only flush from the on disk size to the smaller of the in memory - * file size or the new size as that's the range we really care about - * here and prevents waiting for other data not within the range we - * care about here. + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. */ - if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + if (newsize > ip->i_d.di_size && + (oldsize != ip->i_d.di_size || did_zeroing)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* - * Wait for all direct I/O to complete. - */ + /* Now wait for all direct I/O to complete. 
*/ inode_dio_wait(inode); /* diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 4b33ef112400..365dd57ea760 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -300,8 +300,10 @@ xfs_fs_commit_blocks( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) + if (error) { + xfs_trans_cancel(tp, 0); goto out_drop_iolock; + } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 53cc2aaf8d2b..fbbb9e62e274 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -836,6 +836,11 @@ xfs_qm_reset_dqcounts( */ xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, "xfs_quotacheck"); + /* + * Reset type in case we are reusing group quota file for + * project quotas or vice versa + */ + ddq->d_flags = type; ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; |