diff options
Diffstat (limited to 'fs')
125 files changed, 4779 insertions, 1421 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8d7f3e69ae29..7f6c67703195 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d) int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) { - dentry_unhash(d); return v9fs_remove(i, d, 1); } @@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - P9_DPRINTK(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 03330e2e390c..e3e9efc1fdd8 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); - dentry_unhash(dentry); - return affs_remove_header(dentry); } @@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh = NULL; int retval; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 2c4e05160042..20c106f24927 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) _enter("{%x:%u},{%s}", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - dentry_unhash(dentry); - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) goto error; @@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct key *key; int ret; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - vnode = AFS_FS_I(old_dentry->d_inode); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); diff --git a/fs/attr.c b/fs/attr.c index 91dbe2a107f2..caf2aa521e2b 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -175,6 +175,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return -EPERM; } + if ((ia_valid & ATTR_MODE)) { + mode_t amode = attr->ia_mode; + /* Flag setting protected by i_mutex */ + if (is_sxid(amode)) + inode->i_flags &= ~S_NOSEC; + } + now = current_fs_time(inode->i_sb); attr->ia_ctime = now; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 87d95a8cddbc..f55ae23b137e 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EACCES; - dentry_unhash(dentry); - if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index c7d1d06b0483..b14cebfd9047 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct bfs_sb_info *info; int error = -ENOENT; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - old_bh = new_bh = NULL; old_inode = old_dentry->d_inode; if (S_ISDIR(old_inode->i_mode)) @@ -638,10 +638,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * @offset: vec entry offset * * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block - * device limitations. The target block device must allow bio's - * smaller than PAGE_SIZE, so it is always possible to add a single - * page to an empty bio. This should only be used by REQ_PC bios. + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. */ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) @@ -659,10 +660,9 @@ EXPORT_SYMBOL(bio_add_pc_page); * @offset: vec entry offset * * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block - * device limitations. The target block device must allow bio's - * smaller than PAGE_SIZE, so it is always possible to add a single - * page to an empty bio. + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) diff --git a/fs/block_dev.c b/fs/block_dev.c index 1f2b19978333..1a2421f908f0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1272,8 +1272,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) * individual writeable reference is too fragile given the * way @mode is used in blkdev_get/put(). */ - if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) && - !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; disk_block_events(disk); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 93b1aa932014..52d7eca8c7bf 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -121,9 +121,6 @@ struct btrfs_inode { */ u64 index_cnt; - /* the start of block group preferred for allocations. */ - u64 block_group; - /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b0e18d986e0a..d84089349c82 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -43,8 +43,6 @@ struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); - if (path) - path->reada = 1; return path; } @@ -1224,6 +1222,7 @@ static void reada_for_search(struct btrfs_root *root, u64 search; u64 target; u64 nread = 0; + u64 gen; int direction = path->reada; struct extent_buffer *eb; u32 nr; @@ -1251,6 +1250,15 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; while (1) { + if (!node->map_token) { + unsigned long offset = btrfs_node_key_ptr_offset(nr); + map_private_extent_buffer(node, offset, + sizeof(struct btrfs_key_ptr), + &node->map_token, + &node->kaddr, + &node->map_start, + &node->map_len, KM_USER1); + } if (direction < 0) { if (nr == 0) break; @@ -1268,14 +1276,23 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, nr); if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { - readahead_tree_block(root, search, blocksize, - btrfs_node_ptr_generation(node, nr)); + gen = btrfs_node_ptr_generation(node, nr); + if (node->map_token) { + unmap_extent_buffer(node, node->map_token, + KM_USER1); + node->map_token = NULL; + } + readahead_tree_block(root, search, blocksize, gen); nread += blocksize; } nscan++; if ((nread > 65536 || nscan > 32)) break; } + if (node->map_token) { + unmap_extent_buffer(node, node->map_token, KM_USER1); + node->map_token = NULL; + } } /* @@ -1648,9 +1665,6 @@ again: } cow_done: BUG_ON(!cow && ins_len); - if (level != btrfs_header_level(b)) - WARN_ON(1); - level = btrfs_header_level(b); p->nodes[level] = b; if (!p->skip_locking) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 332323e19dd1..378b5b4443f3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -930,7 +930,6 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - u64 open_ioctl_trans; unsigned long mount_opt:20; unsigned long compress_type:4; u64 max_inline; @@ -947,7 +946,6 @@ struct btrfs_fs_info { struct super_block *sb; struct inode *btree_inode; struct backing_dev_info bdi; - struct mutex trans_mutex; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; @@ -968,6 +966,7 @@ struct btrfs_fs_info { struct rw_semaphore subvol_sem; struct srcu_struct subvol_srcu; + spinlock_t trans_lock; struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -980,6 +979,7 @@ struct btrfs_fs_info { atomic_t async_submit_draining; atomic_t nr_async_bios; atomic_t async_delalloc_pages; + atomic_t open_ioctl_trans; /* * this is used by the balancing code to wait for all the pending @@ -1044,6 +1044,7 @@ struct btrfs_fs_info { int closing; int log_root_recovering; int enospc_unlink; + int trans_no_join; u64 total_pinned; @@ -1065,7 +1066,6 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; spinlock_t delalloc_lock; - spinlock_t new_trans_lock; u64 delalloc_bytes; /* data_alloc_cluster is only used in ssd mode */ @@ -1340,6 +1340,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) +#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2238,6 +2239,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, @@ -2350,6 +2354,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* + * Get synced with close_ctree() + */ + smp_mb(); + return fs_info->closing; +} + /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, @@ -2512,8 +2525,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - u64 new_dirid, u64 alloc_hint); + struct btrfs_root *new_root, u64 new_dirid); int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); @@ -2524,7 +2536,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -void btrfs_dirty_inode(struct inode *inode); +void btrfs_dirty_inode(struct inode *inode, int flags); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 01e29503a54b..6462c29d2d37 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -678,6 +678,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&head); next = item; + nitems = 0; /* * count the number of the continuous items that we can insert in batch @@ -1129,7 +1130,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) delayed_node = async_node->delayed_node; root = delayed_node->root; - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto free_path; @@ -1572,8 +1573,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); - btrfs_set_stack_inode_block_group(inode_item, - BTRFS_I(inode)->block_group); + btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), inode->i_atime.tv_sec); @@ -1595,7 +1595,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_delayed_node *delayed_node; - int ret; + int ret = 0; delayed_node = btrfs_get_or_create_delayed_node(inode); if (IS_ERR(delayed_node)) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 98b6a71decba..a203d363184d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1505,24 +1505,24 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - spin_lock(&root->fs_info->new_trans_lock); + spin_lock(&root->fs_info->trans_lock); cur = root->fs_info->running_transaction; if (!cur) { - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); goto sleep; } now = get_seconds(); if (!cur->blocked && (now < cur->start_time || now - cur->start_time < 30)) { - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); delay = HZ * 5; goto sleep; } transid = cur->transid; - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); if (transid == trans->transid) { ret = btrfs_commit_transaction(trans, root); @@ -1613,7 +1613,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); - spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); @@ -1645,6 +1645,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; + fs_info->trans_no_join = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1709,7 +1710,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->do_barriers = 1; - mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); @@ -2479,13 +2479,13 @@ int btrfs_commit_super(struct btrfs_root *root) down_write(&root->fs_info->cleanup_work_sem); up_write(&root->fs_info->cleanup_work_sem); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_commit_transaction(trans, root); @@ -3024,10 +3024,13 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) WARN_ON(1); - mutex_lock(&root->fs_info->trans_mutex); mutex_lock(&root->fs_info->transaction_kthread_mutex); + spin_lock(&root->fs_info->trans_lock); list_splice_init(&root->fs_info->trans_list, &list); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + while (!list_empty(&list)) { t = list_entry(list.next, struct btrfs_transaction, list); if (!t) @@ -3052,23 +3055,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) t->blocked = 0; if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); - mutex_unlock(&root->fs_info->trans_mutex); - mutex_lock(&root->fs_info->trans_mutex); t->commit_done = 1; if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); - mutex_unlock(&root->fs_info->trans_mutex); - - mutex_lock(&root->fs_info->trans_mutex); btrfs_destroy_pending_snapshots(t); btrfs_destroy_delalloc_inodes(root); - spin_lock(&root->fs_info->new_trans_lock); + spin_lock(&root->fs_info->trans_lock); root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); btrfs_destroy_marked_extents(root, &t->dirty_pages, EXTENT_DIRTY); @@ -3082,8 +3080,10 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) kmem_cache_free(btrfs_transaction_cachep, t); } + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->transaction_kthread_mutex); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 169bd62ce776..5b9b6b6df242 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -348,7 +348,7 @@ static int caching_kthread(void *data) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = 1; key.objectid = last; key.offset = 0; @@ -366,8 +366,7 @@ again: nritems = btrfs_header_nritems(leaf); while (1) { - smp_mb(); - if (fs_info->closing > 1) { + if (btrfs_fs_closing(fs_info) > 1) { last = (u64)-1; break; } @@ -379,15 +378,18 @@ again: if (ret) break; - caching_ctl->progress = last; - btrfs_release_path(path); - up_read(&fs_info->extent_commit_sem); - mutex_unlock(&caching_ctl->mutex); - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); - else + if (need_resched() || + btrfs_next_leaf(extent_root, path)) { + caching_ctl->progress = last; + btrfs_release_path(path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + goto again; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; } if (key.objectid < block_group->key.objectid) { @@ -3065,7 +3067,7 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3091,9 +3093,10 @@ alloc: /* commit the current transaction and try again */ commit_trans: - if (!committed && !root->fs_info->open_ioctl_trans) { + if (!committed && + !atomic_read(&root->fs_info->open_ioctl_trans)) { committed = 1; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); @@ -3472,7 +3475,7 @@ again: goto out; ret = -ENOSPC; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto out; ret = btrfs_commit_transaction(trans, root); @@ -3699,7 +3702,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (trans) return -EAGAIN; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); return 0; @@ -3837,6 +3840,37 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->chunk_block_rsv.reserved > 0); } +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; + u64 num_bytes; + int ret; + + /* + * Truncate should be freeing data, but give us 2 items just in case it + * needs to use some space. We may want to be smarter about this in the + * future. + */ + num_bytes = btrfs_calc_trans_metadata_size(root, 2); + + /* We already have enough bytes, just return */ + if (rsv->reserved >= num_bytes) + return 0; + + num_bytes -= rsv->reserved; + + /* + * You should have reserved enough space before hand to do this, so this + * should not fail. + */ + ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); + BUG_ON(ret); + + return 0; +} + int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, int num_items) @@ -3877,23 +3911,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; /* - * one for deleting orphan item, one for updating inode and - * two for calling btrfs_truncate_inode_items. - * - * btrfs_truncate_inode_items is a delete operation, it frees - * more space than it uses in most cases. So two units of - * metadata space should be enough for calling it many times. - * If all of the metadata space is used, we can commit - * transaction and use space it freed. + * We need to hold space in order to delete our orphan item once we've + * added it, so this takes the reservation so we can release it later + * when we are truly done with the orphan item. */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4); + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4); + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } @@ -4987,6 +5016,15 @@ have_block_group: if (unlikely(block_group->ro)) goto loop; + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + /* * Ok we want to try and use the cluster allocator, so lets look * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will @@ -5150,6 +5188,7 @@ checks: btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); + btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; @@ -5242,14 +5281,7 @@ loop: ret = -ENOSPC; } else if (!ins->objectid) { ret = -ENOSPC; - } - - /* we found what we needed */ - if (ins->objectid) { - if (!(data & BTRFS_BLOCK_GROUP_DATA)) - trans->block_group = block_group->key.objectid; - - btrfs_put_block_group(block_group); + } else if (ins->objectid) { ret = 0; } @@ -6526,7 +6558,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); alloc_flags = update_block_group_flags(root, cache->flags); @@ -6882,6 +6914,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); if (cache_gen != 0 && diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c5d9fbb92bc3..7055d11c1efd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1476,7 +1476,7 @@ u64 count_range_bits(struct extent_io_tree *tree, if (total_bytes >= max_bytes) break; if (!found) { - *start = state->start; + *start = max(cur_start, state->start); found = 1; } last = state->end; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c6a22d783c35..fa4ef18b66b1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -129,7 +129,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!btrfs_test_opt(root, AUTO_DEFRAG)) return 0; - if (root->fs_info->closing) + if (btrfs_fs_closing(root->fs_info)) return 0; if (BTRFS_I(inode)->in_defrag) @@ -144,7 +144,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!defrag) return -ENOMEM; - defrag->ino = inode->i_ino; + defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; @@ -229,7 +229,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) goto next_free; spin_unlock(&fs_info->defrag_inodes_lock); @@ -1480,14 +1480,12 @@ int btrfs_sync_file(struct file *file, int datasync) * the current transaction, we can bail out now without any * syncing */ - mutex_lock(&root->fs_info->trans_mutex); + smp_mb(); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; - mutex_unlock(&root->fs_info->trans_mutex); goto out; } - mutex_unlock(&root->fs_info->trans_mutex); /* * ok we haven't committed the transaction yet, lets do a commit diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 70d45795d758..ad144736a5fd 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -98,7 +98,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (!root->fs_info->closing) { + if (!btrfs_fs_closing(root->fs_info)) { block_group->inode = igrab(inode); block_group->iref = 1; } @@ -402,7 +402,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); spin_unlock(&ctl->tree_lock); - BUG_ON(ret); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } } else { e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { @@ -419,6 +426,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ctl->op->recalc_thresholds(ctl); spin_unlock(&ctl->tree_lock); list_add_tail(&e->list, &bitmaps); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } } num_entries--; @@ -478,8 +493,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, * If we're unmounting then just return, since this does a search on the * normal root and not the commit root and we could deadlock. */ - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) return 0; /* @@ -575,10 +589,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* Since the first page has all of our checksums and our generation we + * need to calculate the offset into the page that we can start writing + * our entries. + */ + first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); + filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); + /* make sure we don't overflow that first page */ + if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { + /* this is really the same as running out of space, where we also return 0 */ + printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); + ret = 0; + goto out_update; + } + /* We need a checksum per page. */ crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); if (!crc) @@ -590,12 +619,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, return -1; } - /* Since the first page has all of our checksums and our generation we - * need to calculate the offset into the page that we can start writing - * our entries. - */ - first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); - /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) cluster = list_entry(block_group->cluster_list.next, @@ -857,12 +880,14 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = 1; out_free: + kfree(checksums); + kfree(pages); + +out_update: if (ret != 1) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; } - kfree(checksums); - kfree(pages); btrfs_update_inode(trans, root, inode); return ret; } @@ -963,10 +988,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * logically. */ if (bitmap) { - WARN_ON(info->bitmap); + if (info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } p = &(*p)->rb_right; } else { - WARN_ON(!info->bitmap); + if (!info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } p = &(*p)->rb_left; } } @@ -2481,7 +2512,7 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, return inode; spin_lock(&root->cache_lock); - if (!root->fs_info->closing) + if (!btrfs_fs_closing(root->fs_info)) root->cache_inode = igrab(inode); spin_unlock(&root->cache_lock); @@ -2504,12 +2535,14 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + /* * If we're unmounting then just return, since this does a search on the * normal root and not the commit root and we could deadlock. */ - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) return 0; path = btrfs_alloc_path(); @@ -2543,6 +2576,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct inode *inode; int ret; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode)) return 0; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 3262cd17a12f..b4087e0fa871 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,6 +38,9 @@ static int caching_kthread(void *data) int slot; int ret; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -59,8 +62,7 @@ again: goto out; while (1) { - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) goto out; leaf = path->nodes[0]; @@ -141,6 +143,9 @@ static void start_caching(struct btrfs_root *root) int ret; u64 objectid; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + spin_lock(&root->cache_lock); if (root->cached != BTRFS_CACHE_NO) { spin_unlock(&root->cache_lock); @@ -178,6 +183,9 @@ static void start_caching(struct btrfs_root *root) int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) { + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return btrfs_find_free_objectid(root, objectid); + again: *objectid = btrfs_find_ino_for_alloc(root); @@ -201,6 +209,10 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + again: if (root->cached == BTRFS_CACHE_FINISHED) { __btrfs_add_free_space(ctl, objectid, 1); @@ -250,6 +262,9 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) struct rb_node *n; u64 count; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + while (1) { n = rb_first(rbroot); if (!n) @@ -388,9 +403,24 @@ int btrfs_save_ino_cache(struct btrfs_root *root, int prealloc; bool retry = false; + /* only fs tree and subvol/snap needs ino cache */ + if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && + (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || + root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) + return 0; + + /* Don't save inode cache if we are deleting this root */ + if (btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) + return 0; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; + again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bb51bb1fa44f..ebf95f7a44d6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -138,7 +138,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; - btrfs_set_trans_block_group(trans, inode); key.objectid = btrfs_ino(inode); key.offset = start; @@ -426,9 +425,8 @@ again: } } if (start == 0) { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ @@ -623,8 +621,9 @@ retry: async_extent->start + async_extent->ram_size - 1, GFP_NOFS); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, @@ -793,9 +792,8 @@ static noinline int cow_file_range(struct inode *inode, int ret = 0; BUG_ON(is_free_space_inode(root, inode)); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); @@ -1077,10 +1075,12 @@ static noinline int run_delalloc_nocow(struct inode *inode, nolock = is_free_space_inode(root, inode); if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; cow_start = (u64)-1; cur_offset = start; @@ -1519,8 +1519,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, { struct btrfs_ordered_sum *sum; - btrfs_set_trans_block_group(trans, inode); - list_for_each_entry(sum, list, list) { btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); @@ -1735,11 +1733,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -1752,11 +1749,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 0, &cached_state, GFP_NOFS); if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -2431,7 +2427,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) (u64)-1); if (root->orphan_block_rsv || root->orphan_item_inserted) { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans, root); } @@ -2511,12 +2507,12 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; int maybe_acls; - u64 alloc_group_block; u32 rdev; int ret; path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -2526,6 +2522,12 @@ static void btrfs_read_locked_inode(struct inode *inode) leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); inode->i_mode = btrfs_inode_mode(leaf, inode_item); inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); @@ -2555,8 +2557,6 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - alloc_group_block = btrfs_inode_block_group(leaf, inode_item); - /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls @@ -2566,8 +2566,11 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!maybe_acls) cache_no_acl(inode); - BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, - alloc_group_block, 0); + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + btrfs_free_path(path); inode_item = NULL; @@ -2647,7 +2650,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_transid(leaf, item, trans->transid); btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); + btrfs_set_inode_block_group(leaf, item, 0); if (leaf->map_token) { unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); @@ -3004,8 +3007,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, @@ -3094,8 +3095,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, root, dir, BTRFS_I(inode)->location.objectid, @@ -3514,7 +3513,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) err = PTR_ERR(trans); break; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_drop_extents(trans, inode, cur_offset, cur_offset + hole_size, @@ -3650,7 +3648,6 @@ void btrfs_evict_inode(struct inode *inode) while (1) { trans = btrfs_start_transaction(root, 0); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; ret = btrfs_block_rsv_check(trans, root, @@ -4133,7 +4130,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + + path->reada = 1; if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); @@ -4268,18 +4266,16 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (BTRFS_I(inode)->dummy_inode) return 0; - smp_mb(); - if (root->fs_info->closing && is_free_space_inode(root, inode)) + if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); if (nolock) ret = btrfs_end_transaction_nolock(trans, root); else @@ -4294,7 +4290,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -void btrfs_dirty_inode(struct inode *inode) +void btrfs_dirty_inode(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4303,9 +4299,8 @@ void btrfs_dirty_inode(struct inode *inode) if (BTRFS_I(inode)->dummy_inode) return; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); if (ret && ret == -ENOSPC) { @@ -4319,7 +4314,6 @@ void btrfs_dirty_inode(struct inode *inode) PTR_ERR(trans)); return; } - btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); if (ret) { @@ -4418,8 +4412,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, const char *name, int name_len, - u64 ref_objectid, u64 objectid, - u64 alloc_hint, int mode, u64 *index) + u64 ref_objectid, u64 objectid, int mode, + u64 *index) { struct inode *inode; struct btrfs_inode_item *inode_item; @@ -4472,8 +4466,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, owner = 0; else owner = 1; - BTRFS_I(inode)->block_group = - btrfs_find_block_group(root, 0, alloc_hint, owner); key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); @@ -4629,15 +4621,13 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_ino(root, &objectid); if (err) goto out_unlock; inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(dir), objectid, - BTRFS_I(dir)->block_group, mode, &index); + mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -4649,7 +4639,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -4658,8 +4647,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); @@ -4692,15 +4679,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_ino(root, &objectid); if (err) goto out_unlock; inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(dir), objectid, - BTRFS_I(dir)->block_group, mode, &index); + mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -4712,7 +4697,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -4723,8 +4707,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); @@ -4771,8 +4753,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_inc_nlink(inode); inode->i_ctime = CURRENT_TIME; - - btrfs_set_trans_block_group(trans, dir); ihold(inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); @@ -4781,7 +4761,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dget_parent(dentry); - btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); BUG_ON(err); btrfs_log_new_name(trans, inode, NULL, parent); @@ -4818,7 +4797,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_ino(root, &objectid); if (err) @@ -4826,8 +4804,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(dir), objectid, - BTRFS_I(dir)->block_group, S_IFDIR | mode, - &index); + S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fail; @@ -4841,7 +4818,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); @@ -4855,8 +4831,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); drop_on_err = 0; - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_fail: nr = trans->blocks_used; @@ -4989,7 +4963,15 @@ again: if (!path) { path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + goto out; + } + /* + * Chances are we'll be called again, so go ahead and do + * readahead + */ + path->reada = 1; } ret = btrfs_lookup_file_extent(trans, root, path, @@ -5130,8 +5112,10 @@ again: kunmap(page); free_extent_map(em); em = NULL; + btrfs_release_path(path); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) return ERR_CAST(trans); goto again; @@ -5375,7 +5359,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + len - 1, 0); } - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return ERR_CAST(trans); @@ -5611,7 +5595,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * to make sure the current transaction stays open * while we look for nocow cross refs */ - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto must_cow; @@ -5750,7 +5734,7 @@ again: BUG_ON(!ordered); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { err = -ENOMEM; goto out; @@ -6500,6 +6484,7 @@ out: static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv; int ret; int err = 0; struct btrfs_trans_handle *trans; @@ -6513,28 +6498,80 @@ static int btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); + /* + * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * 3 things going on here + * + * 1) We need to reserve space for our orphan item and the space to + * delete our orphan item. Lord knows we don't want to have a dangling + * orphan item because we didn't reserve space to remove it. + * + * 2) We need to reserve space to update our inode. + * + * 3) We need to have something to cache all the space that is going to + * be free'd up by the truncate operation, but also have some slack + * space reserved in case it uses space during the truncate (thank you + * very much snapshotting). + * + * And we need these to all be seperate. The fact is we can use alot of + * space doing the truncate, and we have no earthly idea how much space + * we will use, so we need the truncate reservation to be seperate so it + * doesn't end up using space reserved for updating the inode or + * removing the orphan item. We also need to be able to stop the + * transaction and start a new one, which means we need to be able to + * update the inode several times, and we have no idea of knowing how + * many times that will be, so we can't just reserve 1 item for the + * entirety of the opration, so that has to be done seperately as well. + * Then there is the orphan item, which does indeed need to be held on + * to for the whole operation, and we need nobody to touch this reserved + * space except the orphan code. + * + * So that leaves us with + * + * 1) root->orphan_block_rsv - for the orphan deletion. + * 2) rsv - for the truncate reservation, which we will steal from the + * transaction reservation. + * 3) fs_info->trans_block_rsv - this will have 1 items worth left for + * updating the inode. + */ + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) + return -ENOMEM; + btrfs_add_durable_block_rsv(root->fs_info, rsv); + + trans = btrfs_start_transaction(root, 4); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } - btrfs_set_trans_block_group(trans, inode); + /* + * Reserve space for the truncate process. Truncate should be adding + * space, but if there are snapshots it may end up using space. + */ + ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + BUG_ON(ret); ret = btrfs_orphan_add(trans, inode); if (ret) { btrfs_end_transaction(trans, root); - return ret; + goto out; } nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); - /* Now start a transaction for the truncate */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; + /* + * Ok so we've already migrated our bytes over for the truncate, so here + * just reserve the one slot we need for updating the inode. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + trans->block_rsv = rsv; /* * setattr is responsible for setting the ordered_data_close flag, @@ -6558,24 +6595,17 @@ static int btrfs_truncate(struct inode *inode) while (1) { if (!trans) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; - } + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, 0, 5); - if (ret == -EAGAIN) { - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - trans = NULL; - continue; - } else if (ret) { - err = ret; - break; + ret = btrfs_truncate_reserve_metadata(trans, root, + rsv); + BUG_ON(ret); + + trans->block_rsv = rsv; } ret = btrfs_truncate_inode_items(trans, root, inode, @@ -6586,6 +6616,7 @@ static int btrfs_truncate(struct inode *inode) break; } + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret) { err = ret; @@ -6599,6 +6630,7 @@ static int btrfs_truncate(struct inode *inode) } if (ret == 0 && inode->i_nlink > 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; @@ -6610,15 +6642,20 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(NULL, inode); } + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret && !err) err = ret; nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + +out: + btrfs_free_block_rsv(root, rsv); + if (ret && !err) err = ret; - btrfs_btree_balance_dirty(root, nr); return err; } @@ -6627,15 +6664,14 @@ static int btrfs_truncate(struct inode *inode) * create a new subvolume directory/inode (helper for the ioctl). */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - u64 new_dirid, u64 alloc_hint) + struct btrfs_root *new_root, u64 new_dirid) { struct inode *inode; int err; u64 index = 0; inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, - new_dirid, alloc_hint, S_IFDIR | 0700, &index); + new_dirid, S_IFDIR | 0700, &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; @@ -6748,21 +6784,6 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - if (root == root->fs_info->tree_root) { - struct btrfs_block_group_cache *block_group; - - block_group = btrfs_lookup_block_group(root->fs_info, - BTRFS_I(inode)->block_group); - if (block_group && block_group->inode == inode) { - spin_lock(&block_group->lock); - block_group->inode = NULL; - spin_unlock(&block_group->lock); - btrfs_put_block_group(block_group); - } else if (block_group) { - btrfs_put_block_group(block_group); - } - } - spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", @@ -6948,8 +6969,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_notrans; } - btrfs_set_trans_block_group(trans, new_dir); - if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -7131,16 +7150,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_ino(root, &objectid); if (err) goto out_unlock; inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(dir), objectid, - BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, - &index); + S_IFLNK|S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -7152,7 +7168,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -7163,8 +7178,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); if (drop_inode) goto out_unlock; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 85e818ce00c5..ac37040e426a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -243,7 +243,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_update_inode(trans, root, inode); @@ -414,8 +414,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, new_dirid, - BTRFS_I(dir)->block_group); + ret = btrfs_create_subvol_root(trans, new_root, new_dirid); /* * insert the directory item */ @@ -707,16 +706,17 @@ static int find_new_extents(struct btrfs_root *root, struct btrfs_file_extent_item *extent; int type; int ret; + u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - min_key.objectid = inode->i_ino; + min_key.objectid = ino; min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; - max_key.objectid = inode->i_ino; + max_key.objectid = ino; max_key.type = (u8)-1; max_key.offset = (u64)-1; @@ -727,7 +727,7 @@ static int find_new_extents(struct btrfs_root *root, path, 0, newer_than); if (ret != 0) goto none; - if (min_key.objectid != inode->i_ino) + if (min_key.objectid != ino) goto none; if (min_key.type != BTRFS_EXTENT_DATA_KEY) goto none; @@ -2489,12 +2489,10 @@ static long btrfs_ioctl_trans_start(struct file *file) if (ret) goto out; - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans++; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_inc(&root->fs_info->open_ioctl_trans); ret = -ENOMEM; - trans = btrfs_start_ioctl_transaction(root, 0); + trans = btrfs_start_ioctl_transaction(root); if (IS_ERR(trans)) goto out_drop; @@ -2502,9 +2500,7 @@ static long btrfs_ioctl_trans_start(struct file *file) return 0; out_drop: - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans--; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_dec(&root->fs_info->open_ioctl_trans); mnt_drop_write(file->f_path.mnt); out: return ret; @@ -2738,9 +2734,7 @@ long btrfs_ioctl_trans_end(struct file *file) btrfs_end_transaction(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans--; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_dec(&root->fs_info->open_ioctl_trans); mnt_drop_write(file->f_path.mnt); return 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ca38eca70af0..b1ef27cc673b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -677,6 +677,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } + path1->reada = 1; + path2->reada = 2; node = alloc_backref_node(cache); if (!node) { @@ -1999,6 +2001,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -2139,10 +2142,10 @@ int prepare_to_merge(struct reloc_control *rc, int err) u64 num_bytes = 0; int ret; - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); again: if (!err) { num_bytes = rc->merging_rsv_size; @@ -2152,7 +2155,7 @@ again: err = ret; } - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { if (!err) btrfs_block_rsv_release(rc->extent_root, @@ -2211,9 +2214,9 @@ int merge_reloc_roots(struct reloc_control *rc) int ret; again: root = rc->extent_root; - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); list_splice_init(&rc->reloc_roots, &reloc_roots); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); while (!list_empty(&reloc_roots)) { found = 1; @@ -3236,7 +3239,7 @@ truncate: goto out; } - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_free_path(path); ret = PTR_ERR(trans); @@ -3300,6 +3303,7 @@ static int find_data_references(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { @@ -3586,17 +3590,17 @@ next: static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); fs_info->reloc_ctl = rc; - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); fs_info->reloc_ctl = NULL; - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); } static int check_extent_flags(u64 flags) @@ -3645,7 +3649,7 @@ int prepare_to_relocate(struct reloc_control *rc) rc->create_reloc_tree = 1; set_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); BUG_ON(IS_ERR(trans)); btrfs_commit_transaction(trans, rc->extent_root); return 0; @@ -3668,6 +3672,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; ret = prepare_to_relocate(rc); if (ret) { @@ -3834,7 +3839,7 @@ restart: btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); /* get rid of pinned extents */ - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) err = PTR_ERR(trans); else @@ -4093,6 +4098,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = -1; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; @@ -4159,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) set_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { unset_reloc_control(rc); err = PTR_ERR(trans); @@ -4193,7 +4199,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) err = PTR_ERR(trans); else diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6dfed0c27ac3..df50fd1eca8f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -117,33 +117,37 @@ static void scrub_free_csums(struct scrub_dev *sdev) } } +static void scrub_free_bio(struct bio *bio) +{ + int i; + struct page *last_page = NULL; + + if (!bio) + return; + + for (i = 0; i < bio->bi_vcnt; ++i) { + if (bio->bi_io_vec[i].bv_page == last_page) + continue; + last_page = bio->bi_io_vec[i].bv_page; + __free_page(last_page); + } + bio_put(bio); +} + static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) { int i; - int j; - struct page *last_page; if (!sdev) return; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio = sdev->bios[i]; - struct bio *bio; if (!sbio) break; - bio = sbio->bio; - if (bio) { - last_page = NULL; - for (j = 0; j < bio->bi_vcnt; ++j) { - if (bio->bi_io_vec[j].bv_page == last_page) - continue; - last_page = bio->bi_io_vec[j].bv_page; - __free_page(last_page); - } - bio_put(bio); - } + scrub_free_bio(sbio->bio); kfree(sbio); } @@ -156,8 +160,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) { struct scrub_dev *sdev; int i; - int j; - int ret; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; sdev = kzalloc(sizeof(*sdev), GFP_NOFS); @@ -165,7 +167,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) goto nomem; sdev->dev = dev; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct bio *bio; struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); @@ -173,32 +174,10 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) goto nomem; sdev->bios[i] = sbio; - bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); - if (!bio) - goto nomem; - sbio->index = i; sbio->sdev = sdev; - sbio->bio = bio; sbio->count = 0; sbio->work.func = scrub_checksum; - bio->bi_private = sdev->bios[i]; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_sector = 0; - bio->bi_bdev = dev->bdev; - bio->bi_size = 0; - - for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) { - struct page *page; - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) - goto nomem; - } - WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO); if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; @@ -369,9 +348,6 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, int ret; DECLARE_COMPLETION_ONSTACK(complete); - /* we are going to wait on this IO */ - rw |= REQ_SYNC; - bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = bdev; bio->bi_sector = sector; @@ -380,6 +356,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio->bi_private = &complete; submit_bio(rw, bio); + /* this will also unplug the queue */ wait_for_completion(&complete); ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -394,6 +371,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; sbio->err = err; + sbio->bio = bio; btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); } @@ -453,6 +431,8 @@ static void scrub_checksum(struct btrfs_work *work) } out: + scrub_free_bio(sbio->bio); + sbio->bio = NULL; spin_lock(&sdev->list_lock); sbio->next_free = sdev->first_free; sdev->first_free = sbio->index; @@ -583,25 +563,50 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; + struct bio *bio; + int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - sbio->bio->bi_sector = sbio->physical >> 9; - sbio->bio->bi_size = sbio->count * PAGE_SIZE; - sbio->bio->bi_next = NULL; - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_comp_cpu = -1; - sbio->bio->bi_bdev = sdev->dev->bdev; + bio = bio_alloc(GFP_NOFS, sbio->count); + if (!bio) + goto nomem; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + + for (i = 0; i < sbio->count; ++i) { + struct page *page; + int ret; + + page = alloc_page(GFP_NOFS); + if (!page) + goto nomem; + + ret = bio_add_page(bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + goto nomem; + } + } + sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(0, sbio->bio); + submit_bio(READ, bio); return 0; + +nomem: + scrub_free_bio(bio); + + return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -633,7 +638,11 @@ again: sbio->logical = logical; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - scrub_submit(sdev); + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; goto again; } sbio->spag[sbio->count].flags = flags; @@ -645,8 +654,13 @@ again: memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); } ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) - scrub_submit(sdev); + if (sbio->count == SCRUB_PAGES_PER_BIO || force) { + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; + } return 0; } @@ -727,6 +741,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; + struct blk_plug plug; u64 flags; int ret; int slot; @@ -831,6 +846,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, * the scrub. This might currently (crc32) end up to be about 1MB */ start_stripe = 0; + blk_start_plug(&plug); again: logical = base + offset + start_stripe * increment; for (i = start_stripe; i < nstripes; ++i) { @@ -972,6 +988,7 @@ next: scrub_submit(sdev); out: + blk_finish_plug(&plug); btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -1166,7 +1183,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, int ret; struct btrfs_device *dev; - if (root->fs_info->closing) + if (btrfs_fs_closing(root->fs_info)) return -EINVAL; /* diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b2e7e5bc3ef..117e74e3604b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -161,7 +161,8 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, + Opt_inode_cache, Opt_err, }; static match_table_t tokens = { @@ -193,6 +194,7 @@ static match_table_t tokens = { {Opt_enospc_debug, "enospc_debug"}, {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, + {Opt_inode_cache, "inode_cache"}, {Opt_err, NULL}, }; @@ -361,6 +363,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_inode_cache: + printk(KERN_INFO "btrfs: enabling inode map caching\n"); + btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + break; case Opt_clear_cache: printk(KERN_INFO "btrfs: force clearing of disk cache\n"); btrfs_set_opt(info->mount_opt, CLEAR_CACHE); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dc80f7156923..dd719662340e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -35,6 +35,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { + BUG_ON(!list_empty(&transaction->list)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -49,46 +50,72 @@ static noinline void switch_commit_root(struct btrfs_root *root) /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root) +static noinline int join_transaction(struct btrfs_root *root, int nofail) { struct btrfs_transaction *cur_trans; + + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->trans_no_join) { + if (!nofail) { + spin_unlock(&root->fs_info->trans_lock); + return -EBUSY; + } + } + cur_trans = root->fs_info->running_transaction; - if (!cur_trans) { - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, - GFP_NOFS); - if (!cur_trans) - return -ENOMEM; - root->fs_info->generation++; - atomic_set(&cur_trans->num_writers, 1); - cur_trans->num_joined = 0; - cur_trans->transid = root->fs_info->generation; - init_waitqueue_head(&cur_trans->writer_wait); - init_waitqueue_head(&cur_trans->commit_wait); - cur_trans->in_commit = 0; - cur_trans->blocked = 0; - atomic_set(&cur_trans->use_count, 1); - cur_trans->commit_done = 0; - cur_trans->start_time = get_seconds(); - - cur_trans->delayed_refs.root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - spin_lock_init(&cur_trans->delayed_refs.lock); - - INIT_LIST_HEAD(&cur_trans->pending_snapshots); - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); - extent_io_tree_init(&cur_trans->dirty_pages, - root->fs_info->btree_inode->i_mapping); - spin_lock(&root->fs_info->new_trans_lock); - root->fs_info->running_transaction = cur_trans; - spin_unlock(&root->fs_info->new_trans_lock); - } else { + if (cur_trans) { + atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; } + spin_unlock(&root->fs_info->trans_lock); + + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + if (!cur_trans) + return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->running_transaction) { + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + cur_trans = root->fs_info->running_transaction; + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; + } + atomic_set(&cur_trans->num_writers, 1); + cur_trans->num_joined = 0; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->blocked = 0; + /* + * One for this trans handle, one so it will live on until we + * commit the transaction. + */ + atomic_set(&cur_trans->use_count, 2); + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root = RB_ROOT; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + spin_lock_init(&cur_trans->commit_lock); + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping); + root->fs_info->generation++; + cur_trans->transid = root->fs_info->generation; + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -99,39 +126,28 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + spin_lock(&root->fs_info->fs_roots_radix_lock); + if (root->last_trans == trans->transid) { + spin_unlock(&root->fs_info->fs_roots_radix_lock); + return 0; + } + root->last_trans = trans->transid; radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); - root->last_trans = trans->transid; + spin_unlock(&root->fs_info->fs_roots_radix_lock); btrfs_init_reloc_root(trans, root); } return 0; } -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (!root->ref_cows) - return 0; - - mutex_lock(&root->fs_info->trans_mutex); - if (root->last_trans == trans->transid) { - mutex_unlock(&root->fs_info->trans_mutex); - return 0; - } - - record_root_in_trans(trans, root); - mutex_unlock(&root->fs_info->trans_mutex); - return 0; -} - /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -140,21 +156,23 @@ static void wait_current_trans(struct btrfs_root *root) { struct btrfs_transaction *cur_trans; + spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; if (cur_trans && cur_trans->blocked) { DEFINE_WAIT(wait); atomic_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); while (1) { prepare_to_wait(&root->fs_info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); if (!cur_trans->blocked) break; - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); } finish_wait(&root->fs_info->transaction_wait, &wait); put_transaction(cur_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); } } @@ -167,10 +185,16 @@ enum btrfs_trans_type { static int may_wait_transaction(struct btrfs_root *root, int type) { - if (!root->fs_info->log_root_recovering && - ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || - type == TRANS_USERSPACE)) + if (root->fs_info->log_root_recovering) + return 0; + + if (type == TRANS_USERSPACE) + return 1; + + if (type == TRANS_START && + !atomic_read(&root->fs_info->open_ioctl_trans)) return 1; + return 0; } @@ -184,36 +208,44 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return ERR_PTR(-EROFS); + + if (current->journal_info) { + WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + h = current->journal_info; + h->use_count++; + h->orig_rsv = h->block_rsv; + h->block_rsv = NULL; + goto got_it; + } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) return ERR_PTR(-ENOMEM); - if (type != TRANS_JOIN_NOLOCK) - mutex_lock(&root->fs_info->trans_mutex); if (may_wait_transaction(root, type)) wait_current_trans(root); - ret = join_transaction(root); + do { + ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); + if (ret == -EBUSY) + wait_current_trans(root); + } while (ret == -EBUSY); + if (ret < 0) { kmem_cache_free(btrfs_trans_handle_cachep, h); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); return ERR_PTR(ret); } cur_trans = root->fs_info->running_transaction; - atomic_inc(&cur_trans->use_count); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); h->transid = cur_trans->transid; h->transaction = cur_trans; h->blocks_used = 0; - h->block_group = 0; h->bytes_reserved = 0; h->delayed_ref_updates = 0; + h->use_count = 1; h->block_rsv = NULL; + h->orig_rsv = NULL; smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -241,11 +273,8 @@ again: } } - if (type != TRANS_JOIN_NOLOCK) - mutex_lock(&root->fs_info->trans_mutex); - record_root_in_trans(h, root); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); +got_it: + btrfs_record_root_in_trans(h, root); if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; @@ -257,22 +286,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, { return start_transaction(root, num_items, TRANS_START); } -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks) +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN); } -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, - int num_blocks) +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOLOCK); } -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks) +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(r, 0, TRANS_USERSPACE); + return start_transaction(root, 0, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ @@ -280,17 +306,13 @@ static noinline int wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { DEFINE_WAIT(wait); - mutex_lock(&root->fs_info->trans_mutex); while (!commit->commit_done) { prepare_to_wait(&commit->commit_wait, &wait, TASK_UNINTERRUPTIBLE); if (commit->commit_done) break; - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); } - mutex_unlock(&root->fs_info->trans_mutex); finish_wait(&commit->commit_wait, &wait); return 0; } @@ -300,59 +322,56 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) struct btrfs_transaction *cur_trans = NULL, *t; int ret; - mutex_lock(&root->fs_info->trans_mutex); - ret = 0; if (transid) { if (transid <= root->fs_info->last_trans_committed) - goto out_unlock; + goto out; /* find specified transaction */ + spin_lock(&root->fs_info->trans_lock); list_for_each_entry(t, &root->fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; + atomic_inc(&cur_trans->use_count); break; } if (t->transid > transid) break; } + spin_unlock(&root->fs_info->trans_lock); ret = -EINVAL; if (!cur_trans) - goto out_unlock; /* bad transid */ + goto out; /* bad transid */ } else { /* find newest transaction that is committing | committed */ + spin_lock(&root->fs_info->trans_lock); list_for_each_entry_reverse(t, &root->fs_info->trans_list, list) { if (t->in_commit) { if (t->commit_done) - goto out_unlock; + goto out; cur_trans = t; + atomic_inc(&cur_trans->use_count); break; } } + spin_unlock(&root->fs_info->trans_lock); if (!cur_trans) - goto out_unlock; /* nothing committing|committed */ + goto out; /* nothing committing|committed */ } - atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); - wait_for_commit(root, cur_trans); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(cur_trans); ret = 0; -out_unlock: - mutex_unlock(&root->fs_info->trans_mutex); +out: return ret; } void btrfs_throttle(struct btrfs_root *root) { - mutex_lock(&root->fs_info->trans_mutex); - if (!root->fs_info->open_ioctl_trans) + if (!atomic_read(&root->fs_info->open_ioctl_trans)) wait_current_trans(root); - mutex_unlock(&root->fs_info->trans_mutex); } static int should_end_transaction(struct btrfs_trans_handle *trans, @@ -370,6 +389,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; int updates; + smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; @@ -388,6 +408,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info = root->fs_info; int count = 0; + if (--trans->use_count) { + trans->block_rsv = trans->orig_rsv; + return 0; + } + while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -410,9 +435,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); - if (lock && !root->fs_info->open_ioctl_trans && - should_end_transaction(trans, root)) + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && + should_end_transaction(trans, root)) { trans->transaction->blocked = 1; + smp_wmb(); + } if (lock && cur_trans->blocked && !cur_trans->in_commit) { if (throttle) @@ -703,9 +730,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, */ int btrfs_add_dead_root(struct btrfs_root *root) { - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); list_add(&root->root_list, &root->fs_info->dead_roots); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -721,6 +748,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, int ret; int err = 0; + spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, (void **)gang, 0, @@ -733,6 +761,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); @@ -753,10 +782,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, err = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; } } + spin_unlock(&fs_info->fs_roots_radix_lock); return err; } @@ -786,7 +817,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) btrfs_btree_balance_dirty(info->tree_root, nr); cond_resched(); - if (root->fs_info->closing || ret != -EAGAIN) + if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) break; } root->defrag_running = 0; @@ -851,7 +882,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent = dget_parent(dentry); parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); + btrfs_record_root_in_trans(trans, parent_root); /* * insert the directory item @@ -869,7 +900,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - record_root_in_trans(trans, root); + btrfs_record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -967,20 +998,20 @@ static void update_super_roots(struct btrfs_root *root) int btrfs_transaction_in_commit(struct btrfs_fs_info *info) { int ret = 0; - spin_lock(&info->new_trans_lock); + spin_lock(&info->trans_lock); if (info->running_transaction) ret = info->running_transaction->in_commit; - spin_unlock(&info->new_trans_lock); + spin_unlock(&info->trans_lock); return ret; } int btrfs_transaction_blocked(struct btrfs_fs_info *info) { int ret = 0; - spin_lock(&info->new_trans_lock); + spin_lock(&info->trans_lock); if (info->running_transaction) ret = info->running_transaction->blocked; - spin_unlock(&info->new_trans_lock); + spin_unlock(&info->trans_lock); return ret; } @@ -1004,9 +1035,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root, &wait); break; } - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&root->fs_info->transaction_blocked_wait, &wait); } } @@ -1032,9 +1061,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, &wait); break; } - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&root->fs_info->transaction_wait, &wait); } @@ -1072,7 +1099,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, INIT_DELAYED_WORK(&ac->work, do_async_commit); ac->root = root; - ac->newtrans = btrfs_join_transaction(root, 0); + ac->newtrans = btrfs_join_transaction(root); if (IS_ERR(ac->newtrans)) { int err = PTR_ERR(ac->newtrans); kfree(ac); @@ -1080,22 +1107,18 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, } /* take transaction reference */ - mutex_lock(&root->fs_info->trans_mutex); cur_trans = trans->transaction; atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); schedule_delayed_work(&ac->work, 0); /* wait for transaction to start and unblock */ - mutex_lock(&root->fs_info->trans_mutex); if (wait_for_unblock) wait_current_trans_commit_start_and_unblock(root, cur_trans); else wait_current_trans_commit_start(root, cur_trans); put_transaction(cur_trans); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } @@ -1139,38 +1162,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { + spin_unlock(&cur_trans->commit_lock); atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); ret = wait_for_commit(root, cur_trans); BUG_ON(ret); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(cur_trans); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } trans->transaction->in_commit = 1; trans->transaction->blocked = 1; + spin_unlock(&cur_trans->commit_lock); wake_up(&root->fs_info->transaction_blocked_wait); + spin_lock(&root->fs_info->trans_lock); if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (!prev_trans->commit_done) { atomic_inc(&prev_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); wait_for_commit(root, prev_trans); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(prev_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); } + } else { + spin_unlock(&root->fs_info->trans_lock); } if (now < cur_trans->start_time || now - cur_trans->start_time < 1) @@ -1178,12 +1204,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, do { int snap_pending = 0; + joined = cur_trans->num_joined; if (!list_empty(&trans->transaction->pending_snapshots)) snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); @@ -1206,14 +1232,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); - smp_mb(); if (atomic_read(&cur_trans->num_writers) > 1) schedule_timeout(MAX_SCHEDULE_TIMEOUT); else if (should_grow) schedule_timeout(1); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); @@ -1258,9 +1285,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; - spin_lock(&root->fs_info->new_trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->new_trans_lock); btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); @@ -1281,10 +1305,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, sizeof(root->fs_info->super_copy)); trans->transaction->blocked = 0; + spin_lock(&root->fs_info->trans_lock); + root->fs_info->running_transaction = NULL; + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); wake_up(&root->fs_info->transaction_wait); - mutex_unlock(&root->fs_info->trans_mutex); ret = btrfs_write_and_wait_transaction(trans, root); BUG_ON(ret); write_ctree_super(trans, root, 0); @@ -1297,22 +1324,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - cur_trans->commit_done = 1; root->fs_info->last_trans_committed = cur_trans->transid; wake_up(&cur_trans->commit_wait); + spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); + put_transaction(cur_trans); put_transaction(cur_trans); trace_btrfs_transaction_commit(root); - mutex_unlock(&root->fs_info->trans_mutex); - btrfs_scrub_continue(root); if (current->journal_info == trans) @@ -1334,9 +1360,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); list_splice_init(&fs_info->dead_roots, &list); - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); while (!list_empty(&list)) { root = list_entry(list.next, struct btrfs_root, root_list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 804c88639e5d..02564e6230ac 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -28,10 +28,12 @@ struct btrfs_transaction { * transaction can end */ atomic_t num_writers; + atomic_t use_count; unsigned long num_joined; + + spinlock_t commit_lock; int in_commit; - atomic_t use_count; int commit_done; int blocked; struct list_head list; @@ -45,13 +47,14 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; - u64 block_group; u64 bytes_reserved; + unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *orig_rsv; }; struct btrfs_pending_snapshot { @@ -66,19 +69,6 @@ struct btrfs_pending_snapshot { struct list_head list; }; -static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - trans->block_group = BTRFS_I(inode)->block_group; -} - -static inline void btrfs_update_inode_block_group( - struct btrfs_trans_handle *trans, - struct inode *inode) -{ - BTRFS_I(inode)->block_group = trans->block_group; -} - static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -92,12 +82,9 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks); -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, - int num_blocks); -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c48214ef5c09..da541dfca2e3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -504,7 +504,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) BUG_ON(!new_device); memcpy(new_device, device, sizeof(*new_device)); new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(!new_device->name); + BUG_ON(device->name && !new_device->name); new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index f3107e4b4d56..5366fe452ab0 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); - ret = do_setxattr(trans, inode, name, value, size, flags); if (ret) goto out; diff --git a/fs/buffer.c b/fs/buffer.c index 698c6b2cc462..49c9aada0374 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2382,6 +2382,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, ret = -EAGAIN; goto out_unlock; } + wait_on_page_writeback(page); return 0; out_unlock: unlock_page(page); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 8f1700623b41..21de1d6d5849 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -74,8 +74,9 @@ shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem, * Run idmap cache shrinker. */ static int -cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) { + int nr_to_scan = sc->nr_to_scan; int nr_del = 0; int nr_rem = 0; struct rb_root *root; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index a46126fd5735..2b8dae4d121e 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) int len = de->d_name.len; int error; - dentry_unhash(de); - error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); if (!error) { /* VFS may delete the child */ @@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, int new_length = new_dentry->d_name.len; int error; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 9d17d350abc5..9a37a9b6de3a 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; - dentry_unhash(dentry); - if (dentry->d_parent == configfs_sb->s_root) return -EPERM; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index b8d5c8091024..58609bde3b9f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1024,25 +1024,25 @@ out: } /** - * contains_ecryptfs_marker - check for the ecryptfs marker + * ecryptfs_validate_marker - check for the ecryptfs marker * @data: The data block in which to check * - * Returns one if marker found; zero if not found + * Returns zero if marker found; -EINVAL if not found */ -static int contains_ecryptfs_marker(char *data) +static int ecryptfs_validate_marker(char *data) { u32 m_1, m_2; m_1 = get_unaligned_be32(data); m_2 = get_unaligned_be32(data + 4); if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) - return 1; + return 0; ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2, MAGIC_ECRYPTFS_MARKER); ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = " "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER)); - return 0; + return -EINVAL; } struct ecryptfs_flag_map_elem { @@ -1201,27 +1201,19 @@ int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code) return rc; } -int ecryptfs_read_and_validate_header_region(char *data, - struct inode *ecryptfs_inode) +int ecryptfs_read_and_validate_header_region(struct inode *inode) { - struct ecryptfs_crypt_stat *crypt_stat = - &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); + u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES]; + u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES; int rc; - if (crypt_stat->extent_size == 0) - crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; - rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, - ecryptfs_inode); - if (rc < 0) { - printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n", - __func__, rc); - goto out; - } - if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { - rc = -EINVAL; - } else - rc = 0; -out: + rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES, + inode); + if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return rc >= 0 ? -EINVAL : rc; + rc = ecryptfs_validate_marker(marker); + if (!rc) + ecryptfs_i_size_init(file_size, inode); return rc; } @@ -1242,8 +1234,7 @@ ecryptfs_write_header_metadata(char *virt, (*written) = 6; } -struct kmem_cache *ecryptfs_header_cache_1; -struct kmem_cache *ecryptfs_header_cache_2; +struct kmem_cache *ecryptfs_header_cache; /** * ecryptfs_write_headers_virt @@ -1496,11 +1487,9 @@ static int ecryptfs_read_headers_virt(char *page_virt, crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; offset = ECRYPTFS_FILE_SIZE_BYTES; - rc = contains_ecryptfs_marker(page_virt + offset); - if (rc == 0) { - rc = -EINVAL; + rc = ecryptfs_validate_marker(page_virt + offset); + if (rc) goto out; - } if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; @@ -1567,20 +1556,21 @@ out: return rc; } -int ecryptfs_read_and_validate_xattr_region(char *page_virt, - struct dentry *ecryptfs_dentry) +int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, + struct inode *inode) { + u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES]; + u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES; int rc; - rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_dentry->d_inode); - if (rc) - goto out; - if (!contains_ecryptfs_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES)) { - printk(KERN_WARNING "Valid data found in [%s] xattr, but " - "the marker is invalid\n", ECRYPTFS_XATTR_NAME); - rc = -EINVAL; - } -out: + rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry), + ECRYPTFS_XATTR_NAME, file_size, + ECRYPTFS_SIZE_AND_MARKER_BYTES); + if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES) + return rc >= 0 ? -EINVAL : rc; + rc = ecryptfs_validate_marker(marker); + if (!rc) + ecryptfs_i_size_init(file_size, inode); return rc; } @@ -1610,7 +1600,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, mount_crypt_stat); /* Read the first page from the underlying file */ - page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER); + page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER); if (!page_virt) { rc = -ENOMEM; printk(KERN_ERR "%s: Unable to allocate page_virt\n", @@ -1655,7 +1645,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) out: if (page_virt) { memset(page_virt, 0, PAGE_CACHE_SIZE); - kmem_cache_free(ecryptfs_header_cache_1, page_virt); + kmem_cache_free(ecryptfs_header_cache, page_virt); } return rc; } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index e70282775e2c..43c7c43b06f5 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -200,6 +200,8 @@ ecryptfs_get_key_payload_data(struct key *key) #define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5 #define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */ #define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64)) +#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \ + + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 #define ECRYPTFS_DEFAULT_HASH "md5" @@ -603,8 +605,7 @@ extern struct kmem_cache *ecryptfs_file_info_cache; extern struct kmem_cache *ecryptfs_dentry_info_cache; extern struct kmem_cache *ecryptfs_inode_info_cache; extern struct kmem_cache *ecryptfs_sb_info_cache; -extern struct kmem_cache *ecryptfs_header_cache_1; -extern struct kmem_cache *ecryptfs_header_cache_2; +extern struct kmem_cache *ecryptfs_header_cache; extern struct kmem_cache *ecryptfs_xattr_cache; extern struct kmem_cache *ecryptfs_key_record_cache; extern struct kmem_cache *ecryptfs_key_sig_cache; @@ -625,14 +626,9 @@ struct ecryptfs_open_req { struct list_head kthread_ctl_list; }; -#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001 -int ecryptfs_interpose(struct dentry *hidden_dentry, - struct dentry *this_dentry, struct super_block *sb, - u32 flags); +struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb); void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); -int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, - struct dentry *lower_dentry, - struct inode *ecryptfs_dir_inode); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, size_t *decrypted_name_size, struct dentry *ecryptfs_dentry, @@ -664,10 +660,9 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); void ecryptfs_write_crypt_stat_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); -int ecryptfs_read_and_validate_header_region(char *data, - struct inode *ecryptfs_inode); -int ecryptfs_read_and_validate_xattr_region(char *page_virt, - struct dentry *ecryptfs_dentry); +int ecryptfs_read_and_validate_header_region(struct inode *inode); +int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, + struct inode *inode); u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); @@ -679,9 +674,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, unsigned char *src, struct dentry *ecryptfs_dentry); int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); -int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode); -int ecryptfs_inode_set(struct inode *inode, void *lower_inode); -void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode); ssize_t ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, void *value, size_t size); @@ -761,7 +753,7 @@ int ecryptfs_privileged_open(struct file **lower_file, struct dentry *lower_dentry, struct vfsmount *lower_mnt, const struct cred *cred); -int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry); +int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode); void ecryptfs_put_lower_file(struct inode *inode); int ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 566e5472f78c..4ec9eb00a241 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -191,7 +191,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - rc = ecryptfs_get_lower_file(ecryptfs_dentry); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index bc116b9ffcf2..7349ade17de6 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -51,6 +51,97 @@ static void unlock_dir(struct dentry *dir) dput(dir); } +static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) +{ + if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode) + return 1; + return 0; +} + +static int ecryptfs_inode_set(struct inode *inode, void *opaque) +{ + struct inode *lower_inode = opaque; + + ecryptfs_set_inode_lower(inode, lower_inode); + fsstack_copy_attr_all(inode, lower_inode); + /* i_size will be overwritten for encrypted regular files */ + fsstack_copy_inode_size(inode, lower_inode); + inode->i_ino = lower_inode->i_ino; + inode->i_version++; + inode->i_mapping->a_ops = &ecryptfs_aops; + + if (S_ISLNK(inode->i_mode)) + inode->i_op = &ecryptfs_symlink_iops; + else if (S_ISDIR(inode->i_mode)) + inode->i_op = &ecryptfs_dir_iops; + else + inode->i_op = &ecryptfs_main_iops; + + if (S_ISDIR(inode->i_mode)) + inode->i_fop = &ecryptfs_dir_fops; + else if (special_file(inode->i_mode)) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + else + inode->i_fop = &ecryptfs_main_fops; + + return 0; +} + +static struct inode *__ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) +{ + struct inode *inode; + + if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) + return ERR_PTR(-EXDEV); + if (!igrab(lower_inode)) + return ERR_PTR(-ESTALE); + inode = iget5_locked(sb, (unsigned long)lower_inode, + ecryptfs_inode_test, ecryptfs_inode_set, + lower_inode); + if (!inode) { + iput(lower_inode); + return ERR_PTR(-EACCES); + } + if (!(inode->i_state & I_NEW)) + iput(lower_inode); + + return inode; +} + +struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb) +{ + struct inode *inode = __ecryptfs_get_inode(lower_inode, sb); + + if (!IS_ERR(inode) && (inode->i_state & I_NEW)) + unlock_new_inode(inode); + + return inode; +} + +/** + * ecryptfs_interpose + * @lower_dentry: Existing dentry in the lower filesystem + * @dentry: ecryptfs' dentry + * @sb: ecryptfs's super_block + * + * Interposes upper and lower dentries. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_interpose(struct dentry *lower_dentry, + struct dentry *dentry, struct super_block *sb) +{ + struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + + return 0; +} + /** * ecryptfs_create_underlying_file * @lower_dir_inode: inode of the parent in the lower fs of the new file @@ -129,7 +220,7 @@ ecryptfs_do_create(struct inode *directory_inode, goto out_lock; } rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - directory_inode->i_sb, 0); + directory_inode->i_sb); if (rc) { ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); goto out_lock; @@ -168,7 +259,8 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context; rc = [%d]\n", rc); goto out; } - rc = ecryptfs_get_lower_file(ecryptfs_dentry); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, + ecryptfs_dentry->d_inode); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " @@ -215,102 +307,90 @@ out: return rc; } +static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) +{ + struct ecryptfs_crypt_stat *crypt_stat; + int rc; + + rc = ecryptfs_get_lower_file(dentry, inode); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + dentry->d_name.name, rc); + return rc; + } + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); + + rc = ecryptfs_read_and_validate_header_region(inode); + ecryptfs_put_lower_file(inode); + if (rc) { + rc = ecryptfs_read_and_validate_xattr_region(dentry, inode); + if (!rc) + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } + + /* Must return 0 to allow non-eCryptfs files to be looked up, too */ + return 0; +} + /** - * ecryptfs_lookup_and_interpose_lower - Perform a lookup + * ecryptfs_lookup_interpose - Dentry interposition for a lookup */ -int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, - struct dentry *lower_dentry, - struct inode *ecryptfs_dir_inode) +static int ecryptfs_lookup_interpose(struct dentry *dentry, + struct dentry *lower_dentry, + struct inode *dir_inode) { - struct dentry *lower_dir_dentry; + struct inode *inode, *lower_inode = lower_dentry->d_inode; + struct ecryptfs_dentry_info *dentry_info; struct vfsmount *lower_mnt; - struct inode *lower_inode; - struct ecryptfs_crypt_stat *crypt_stat; - char *page_virt = NULL; - int put_lower = 0, rc = 0; - - lower_dir_dentry = lower_dentry->d_parent; - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( - ecryptfs_dentry->d_parent)); - lower_inode = lower_dentry->d_inode; - fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); + int rc = 0; + + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); + fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); BUG_ON(!lower_dentry->d_count); - ecryptfs_set_dentry_private(ecryptfs_dentry, - kmem_cache_alloc(ecryptfs_dentry_info_cache, - GFP_KERNEL)); - if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) { - rc = -ENOMEM; + + dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + ecryptfs_set_dentry_private(dentry, dentry_info); + if (!dentry_info) { printk(KERN_ERR "%s: Out of memory whilst attempting " "to allocate ecryptfs_dentry_info struct\n", __func__); - goto out_put; + dput(lower_dentry); + mntput(lower_mnt); + d_drop(dentry); + return -ENOMEM; } - ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); - ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); + ecryptfs_set_dentry_lower(dentry, lower_dentry); + ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); + if (!lower_dentry->d_inode) { /* We want to add because we couldn't find in lower */ - d_add(ecryptfs_dentry, NULL); - goto out; - } - rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - ecryptfs_dir_inode->i_sb, - ECRYPTFS_INTERPOSE_FLAG_D_ADD); - if (rc) { - printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", - __func__, rc); - goto out; - } - if (S_ISDIR(lower_inode->i_mode)) - goto out; - if (S_ISLNK(lower_inode->i_mode)) - goto out; - if (special_file(lower_inode->i_mode)) - goto out; - /* Released in this function */ - page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); - if (!page_virt) { - printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n", - __func__); - rc = -ENOMEM; - goto out; + d_add(dentry, NULL); + return 0; } - rc = ecryptfs_get_lower_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the lower file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out_free_kmem; + inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb); + if (IS_ERR(inode)) { + printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n", + __func__, PTR_ERR(inode)); + return PTR_ERR(inode); } - put_lower = 1; - crypt_stat = &ecryptfs_inode_to_private( - ecryptfs_dentry->d_inode)->crypt_stat; - /* TODO: lock for crypt_stat comparison */ - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) - ecryptfs_set_default_sizes(crypt_stat); - rc = ecryptfs_read_and_validate_header_region(page_virt, - ecryptfs_dentry->d_inode); - if (rc) { - memset(page_virt, 0, PAGE_CACHE_SIZE); - rc = ecryptfs_read_and_validate_xattr_region(page_virt, - ecryptfs_dentry); + if (S_ISREG(inode->i_mode)) { + rc = ecryptfs_i_size_read(dentry, inode); if (rc) { - rc = 0; - goto out_free_kmem; + make_bad_inode(inode); + return rc; } - crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; } - ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); -out_free_kmem: - kmem_cache_free(ecryptfs_header_cache_2, page_virt); - goto out; -out_put: - dput(lower_dentry); - mntput(lower_mnt); - d_drop(ecryptfs_dentry); -out: - if (put_lower) - ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + d_add(dentry, inode); + return rc; } @@ -353,12 +433,12 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, goto out_d_drop; } if (lower_dentry->d_inode) - goto lookup_and_interpose; + goto interpose; mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; if (!(mount_crypt_stat && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) - goto lookup_and_interpose; + goto interpose; dput(lower_dentry); rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, @@ -381,9 +461,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, encrypted_and_encoded_name); goto out_d_drop; } -lookup_and_interpose: - rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, - ecryptfs_dir_inode); +interpose: + rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, + ecryptfs_dir_inode); goto out; out_d_drop: d_drop(ecryptfs_dentry); @@ -411,7 +491,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, lower_new_dentry); if (rc || !lower_new_dentry->d_inode) goto out_lock; - rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); + rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); if (rc) goto out_lock; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); @@ -478,7 +558,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, kfree(encoded_symname); if (rc || !lower_dentry->d_inode) goto out_lock; - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out_lock; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); @@ -502,7 +582,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); if (rc || !lower_dentry->d_inode) goto out; - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); @@ -521,8 +601,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) struct dentry *lower_dir_dentry; int rc; - dentry_unhash(dentry); - lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(dentry); lower_dir_dentry = lock_parent(lower_dentry); @@ -552,7 +630,7 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); if (rc || !lower_dentry->d_inode) goto out; - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); @@ -575,9 +653,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dir_dentry; struct dentry *trap = NULL; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); dget(lower_old_dentry); @@ -755,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, lower_ia->ia_valid &= ~ATTR_SIZE; return 0; } - rc = ecryptfs_get_lower_file(dentry); + rc = ecryptfs_get_lower_file(dentry, inode); if (rc) return rc; crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; @@ -911,7 +986,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - rc = ecryptfs_get_lower_file(dentry); + rc = ecryptfs_get_lower_file(dentry, inode); if (rc) { mutex_unlock(&crypt_stat->cs_mutex); goto out; @@ -1084,21 +1159,6 @@ out: return rc; } -int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode) -{ - if ((ecryptfs_inode_to_lower(inode) - == (struct inode *)candidate_lower_inode)) - return 1; - else - return 0; -} - -int ecryptfs_inode_set(struct inode *inode, void *lower_inode) -{ - ecryptfs_init_inode(inode, (struct inode *)lower_inode); - return 0; -} - const struct inode_operations ecryptfs_symlink_iops = { .readlink = ecryptfs_readlink, .follow_link = ecryptfs_follow_link, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 89b93389af8e..9f1bb747d77d 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -135,12 +135,12 @@ static int ecryptfs_init_lower_file(struct dentry *dentry, return rc; } -int ecryptfs_get_lower_file(struct dentry *dentry) +int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode) { - struct ecryptfs_inode_info *inode_info = - ecryptfs_inode_to_private(dentry->d_inode); + struct ecryptfs_inode_info *inode_info; int count, rc = 0; + inode_info = ecryptfs_inode_to_private(inode); mutex_lock(&inode_info->lower_file_mutex); count = atomic_inc_return(&inode_info->lower_file_count); if (WARN_ON_ONCE(count < 1)) @@ -168,75 +168,6 @@ void ecryptfs_put_lower_file(struct inode *inode) } } -static struct inode *ecryptfs_get_inode(struct inode *lower_inode, - struct super_block *sb) -{ - struct inode *inode; - int rc = 0; - - if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { - rc = -EXDEV; - goto out; - } - if (!igrab(lower_inode)) { - rc = -ESTALE; - goto out; - } - inode = iget5_locked(sb, (unsigned long)lower_inode, - ecryptfs_inode_test, ecryptfs_inode_set, - lower_inode); - if (!inode) { - rc = -EACCES; - iput(lower_inode); - goto out; - } - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - else - iput(lower_inode); - if (S_ISLNK(lower_inode->i_mode)) - inode->i_op = &ecryptfs_symlink_iops; - else if (S_ISDIR(lower_inode->i_mode)) - inode->i_op = &ecryptfs_dir_iops; - if (S_ISDIR(lower_inode->i_mode)) - inode->i_fop = &ecryptfs_dir_fops; - if (special_file(lower_inode->i_mode)) - init_special_inode(inode, lower_inode->i_mode, - lower_inode->i_rdev); - fsstack_copy_attr_all(inode, lower_inode); - /* This size will be overwritten for real files w/ headers and - * other metadata */ - fsstack_copy_inode_size(inode, lower_inode); - return inode; -out: - return ERR_PTR(rc); -} - -/** - * ecryptfs_interpose - * @lower_dentry: Existing dentry in the lower filesystem - * @dentry: ecryptfs' dentry - * @sb: ecryptfs's super_block - * @flags: flags to govern behavior of interpose procedure - * - * Interposes upper and lower dentries. - * - * Returns zero on success; non-zero otherwise - */ -int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, - struct super_block *sb, u32 flags) -{ - struct inode *lower_inode = lower_dentry->d_inode; - struct inode *inode = ecryptfs_get_inode(lower_inode, sb); - if (IS_ERR(inode)) - return PTR_ERR(inode); - if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) - d_add(dentry, inode); - else - d_instantiate(dentry, inode); - return 0; -} - enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, @@ -704,13 +635,8 @@ static struct ecryptfs_cache_info { .size = sizeof(struct ecryptfs_sb_info), }, { - .cache = &ecryptfs_header_cache_1, - .name = "ecryptfs_headers_1", - .size = PAGE_CACHE_SIZE, - }, - { - .cache = &ecryptfs_header_cache_2, - .name = "ecryptfs_headers_2", + .cache = &ecryptfs_header_cache, + .name = "ecryptfs_headers", .size = PAGE_CACHE_SIZE, }, { diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 245b517bf1b6..dbd52d40df4c 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -93,22 +93,6 @@ static void ecryptfs_destroy_inode(struct inode *inode) } /** - * ecryptfs_init_inode - * @inode: The ecryptfs inode - * - * Set up the ecryptfs inode. - */ -void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode) -{ - ecryptfs_set_inode_lower(inode, lower_inode); - inode->i_ino = lower_inode->i_ino; - inode->i_version++; - inode->i_op = &ecryptfs_main_iops; - inode->i_fop = &ecryptfs_main_fops; - inode->i_mapping->a_ops = &ecryptfs_aops; -} - -/** * ecryptfs_statfs * @sb: The ecryptfs super block * @buf: The struct kstatfs to fill in with stats diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 68b2e43d7c35..3451d23c3bae 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3392,7 +3392,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ -void ext3_dirty_inode(struct inode *inode) +void ext3_dirty_inode(struct inode *inode, int flags) { handle_t *current_handle = ext3_journal_current_handle(); handle_t *handle; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a74b89c09f90..1921392cd708 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1813,7 +1813,7 @@ extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_sync_inode(handle_t *, struct inode *); -extern void ext4_dirty_inode(struct inode *); +extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 50d0e9c64584..a5763e3505ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5733,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ -void ext4_dirty_inode(struct inode *inode) +void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index be15437c272e..3b222dafd15b 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) struct fat_slot_info sinfo; int err; - dentry_unhash(dentry); - lock_super(sb); /* * Check whether the directory is not in use, then check @@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - err = fat_scan(old_dir, old_name, &old_sinfo); if (err) { err = -EIO; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index c61a6789f36c..20b4ea53fdc4 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) struct fat_slot_info sinfo; int err; - dentry_unhash(dentry); - lock_super(sb); err = fat_dir_empty(inode); @@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 34591ee804b5..0f015a0468de 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1007,9 +1007,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) * In short, make sure you hash any inodes _before_ you start marking * them dirty. * - * This function *must* be atomic for the I_DIRTY_PAGES case - - * set_page_dirty() is called under spinlock in several places. - * * Note that for blockdevs, inode->dirtied_when represents the dirtying time of * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of * the kernel-internal blockdev inode represents the dirtying time of the @@ -1028,7 +1025,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); + sb->s_op->dirty_inode(inode, flags); } /* diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0d0e3faddcfa..d50160714595 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (IS_ERR(req)) return PTR_ERR(req); - dentry_unhash(entry); - req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); req->in.numargs = 1; @@ -694,9 +692,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, struct fuse_conn *fc = get_fuse_conn(olddir); struct fuse_req *req = fuse_get_req(fc); - if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode)) - dentry_unhash(newent); - if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 1cb70cdba2c1..b4d70b13be92 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int res; - if (S_ISDIR(inode->i_mode)) - dentry_unhash(dentry); - if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); @@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - res = hfs_remove(new_dir, new_dentry); if (res) return res; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index b28835091dd0..4df5059c25da 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int res; - dentry_unhash(dentry); - if (inode->i_size != 2) return -ENOTEMPTY; @@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) { - dentry_unhash(new_dentry); + if (S_ISDIR(new_dentry->d_inode->i_mode)) res = hfsplus_rmdir(new_dir, new_dentry); - } else { + else res = hfsplus_unlink(new_dir, new_dentry); - } if (res) return res; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e6816b9e6903..2638c834ed28 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry) char *file; int err; - dentry_unhash(dentry); - if ((file = dentry_name(dentry)) == NULL) return -ENOMEM; err = do_rmdir(file); @@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, char *from_name, *to_name; int err; - if (to->d_inode && S_ISDIR(to->d_inode->i_mode)) - dentry_unhash(to); - if ((from_name = dentry_name(from)) == NULL) return -ENOMEM; if ((to_name = dentry_name(to)) == NULL) { diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index ff0ce21c0867..acf95dab2aac 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -439,8 +439,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) int err; int r; - dentry_unhash(dentry); - hpfs_adjust_length(name, &len); hpfs_lock(dir->i_sb); err = -ENOENT; @@ -535,9 +533,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct fnode *fnode; int err; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - if ((err = hpfs_chk_name(new_name, &new_len))) return err; err = 0; hpfs_adjust_length(old_name, &old_len); diff --git a/fs/inode.c b/fs/inode.c index 990d284877a1..0f7e88a7803f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1,9 +1,7 @@ /* - * linux/fs/inode.c - * * (C) 1997 Linus Torvalds + * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) */ - #include <linux/fs.h> #include <linux/mm.h> #include <linux/dcache.h> @@ -27,10 +25,11 @@ #include <linux/prefetch.h> #include <linux/ima.h> #include <linux/cred.h> +#include <linux/buffer_head.h> /* for inode_has_buffers */ #include "internal.h" /* - * inode locking rules. + * Inode locking rules: * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() @@ -60,54 +59,11 @@ * inode_hash_lock */ -/* - * This is needed for the following functions: - * - inode_has_buffers - * - invalidate_bdev - * - * FIXME: remove all knowledge of the buffer layer from this file - */ -#include <linux/buffer_head.h> - -/* - * New inode.c implementation. - * - * This implementation has the basic premise of trying - * to be extremely low-overhead and SMP-safe, yet be - * simple enough to be "obviously correct". - * - * Famous last words. - */ - -/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */ - -/* #define INODE_PARANOIA 1 */ -/* #define INODE_DEBUG 1 */ - -/* - * Inode lookup is no longer as critical as it used to be: - * most of the lookups are going to be through the dcache. - */ -#define I_HASHBITS i_hash_shift -#define I_HASHMASK i_hash_mask - static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); -/* - * Each inode can be on two separate lists. One is - * the hash list of the inode, used for lookups. The - * other linked list is the "type" list: - * "in_use" - valid inode, i_count > 0, i_nlink > 0 - * "dirty" - as "in_use" but also dirty - * "unused" - valid inode, i_count = 0 - * - * A "dirty" list is maintained for each super block, - * allowing for low-overhead inode sync() operations. - */ - static LIST_HEAD(inode_lru); static DEFINE_SPINLOCK(inode_lru_lock); @@ -424,8 +380,8 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES; - tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); - return tmp & I_HASHMASK; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); + return tmp & i_hash_mask; } /** diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 9a1e86fc1362..4bca6a2e5c07 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -605,8 +605,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) int ret; uint32_t now = get_seconds(); - dentry_unhash(dentry); - for (fd = f->dents ; fd; fd = fd->next) { if (fd->ino) return -ENOTEMPTY; @@ -782,9 +780,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, uint8_t type; uint32_t now; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* The VFS will check for us and prevent trying to rename a * file over a directory and vice versa, but if it's a directory, * the VFS can't check whether the victim is empty. The filesystem diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index e896e67767eb..46ad619b6124 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -357,7 +357,7 @@ error: return ERR_PTR(ret); } -void jffs2_dirty_inode(struct inode *inode) +void jffs2_dirty_inode(struct inode *inode, int flags) { struct iattr iattr; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 00bae7cc2e48..65c6c43ca482 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -172,7 +172,7 @@ int jffs2_setattr (struct dentry *, struct iattr *); int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); -void jffs2_dirty_inode(struct inode *inode); +void jffs2_dirty_inode(struct inode *inode, int flags); struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index eddbb373209e..109655904bbc 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -173,7 +173,7 @@ void jfs_evict_inode(struct inode *inode) dquot_drop(inode); } -void jfs_dirty_inode(struct inode *inode) +void jfs_dirty_inode(struct inode *inode, int flags) { static int noisy = 5; diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 155e91eff07d..ec2fb8b945fc 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -28,7 +28,7 @@ extern struct inode *jfs_iget(struct super_block *, unsigned long); extern int jfs_commit_inode(struct inode *, int); extern int jfs_write_inode(struct inode *, struct writeback_control *); extern void jfs_evict_inode(struct inode *); -extern void jfs_dirty_inode(struct inode *); +extern void jfs_dirty_inode(struct inode *, int); extern void jfs_truncate(struct inode *); extern void jfs_truncate_nolock(struct inode *, loff_t); extern void jfs_free_zero_link(struct inode *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 865df16a6cf3..eaaf2b511e89 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); - dentry_unhash(dentry); - /* Init inode for quota operations. */ dquot_initialize(dip); dquot_initialize(ip); @@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - dquot_initialize(old_dir); dquot_initialize(new_dir); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index f34c9cde9e94..9ed89d1663f8 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; - dentry_unhash(dentry); - if (!logfs_empty_dir(inode)) return -ENOTEMPTY; @@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, loff_t pos; int err; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* 1. locate source dd */ err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); if (err) diff --git a/fs/minix/namei.c b/fs/minix/namei.c index f60aed8db9c4..6e6777f1b4b2 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err = -ENOTEMPTY; - dentry_unhash(dentry); - if (minix_empty_dir(inode)) { err = minix_unlink(dir, dentry); if (!err) { @@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct minix_dir_entry * old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = minix_find_entry(old_dentry, &old_page); if (!old_de) goto out; diff --git a/fs/namei.c b/fs/namei.c index 2358b326b221..e2e4e8d032ee 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -919,12 +919,11 @@ static inline bool managed_dentry_might_block(struct dentry *dentry) } /* - * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we - * meet a managed dentry and we're not walking to "..". True is returned to - * continue, false to abort. + * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if + * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode, bool reverse_transit) + struct inode **inode) { for (;;) { struct vfsmount *mounted; @@ -933,8 +932,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * that wants to block transit. */ *inode = path->dentry->d_inode; - if (!reverse_transit && - unlikely(managed_dentry_might_block(path->dentry))) + if (unlikely(managed_dentry_might_block(path->dentry))) return false; if (!d_mountpoint(path->dentry)) @@ -947,16 +945,24 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->dentry = mounted->mnt_root; nd->seq = read_seqcount_begin(&path->dentry->d_seq); } - - if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - return reverse_transit; return true; } -static int follow_dotdot_rcu(struct nameidata *nd) +static void follow_mount_rcu(struct nameidata *nd) { - struct inode *inode = nd->inode; + while (d_mountpoint(nd->path.dentry)) { + struct vfsmount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); + if (!mounted) + break; + nd->path.mnt = mounted; + nd->path.dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + } +} +static int follow_dotdot_rcu(struct nameidata *nd) +{ set_root_rcu(nd); while (1) { @@ -972,7 +978,6 @@ static int follow_dotdot_rcu(struct nameidata *nd) seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) goto failed; - inode = parent->d_inode; nd->path.dentry = parent; nd->seq = seq; break; @@ -980,10 +985,9 @@ static int follow_dotdot_rcu(struct nameidata *nd) if (!follow_up_rcu(&nd->path)) break; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - inode = nd->path.dentry->d_inode; } - __follow_mount_rcu(nd, &nd->path, &inode, true); - nd->inode = inode; + follow_mount_rcu(nd); + nd->inode = nd->path.dentry->d_inode; return 0; failed: @@ -1157,8 +1161,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, } path->mnt = mnt; path->dentry = dentry; - if (likely(__follow_mount_rcu(nd, path, inode, false))) - return 0; + if (unlikely(!__follow_mount_rcu(nd, path, inode))) + goto unlazy; + if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + goto unlazy; + return 0; unlazy: if (unlazy_walk(nd, dentry)) return -ECHILD; @@ -2572,6 +2579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (error) goto out; + shrink_dcache_parent(dentry); error = dir->i_op->rmdir(dir, dentry); if (error) goto out; @@ -2986,6 +2994,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) goto out; + if (target) + shrink_dcache_parent(new_dentry); error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); if (error) goto out; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index e3e646b06404..9c51f621e901 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1033,8 +1033,11 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) DPRINTK("ncp_rmdir: removing %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); + /* + * fail with EBUSY if there are still references to this + * directory. + */ dentry_unhash(dentry); - error = -EBUSY; if (!d_unhashed(dentry)) goto out; @@ -1141,8 +1144,16 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name); - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) { + /* + * fail with EBUSY if there are still references to this + * directory. + */ dentry_unhash(new_dentry); + error = -EBUSY; + if (!d_unhashed(new_dentry)) + goto out; + } ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index ba306658a6db..81515545ba75 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -87,6 +87,16 @@ config NFS_V4_1 config PNFS_FILE_LAYOUT tristate +config PNFS_OBJLAYOUT + tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" + depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD + help + Say M here if you want your pNFS client to support the Objects Layout Driver. + Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and + upper level driver (SCSI_OSD_ULD). + + If unsure, say N. + config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 4776ff9e3814..6a34f7dd0e6f 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o -nfs-$(CONFIG_NFS_V4_1) += pnfs.o +nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o + +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 46d93ce7311b..b257383bb565 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall( extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); extern void nfs4_cb_take_slot(struct nfs_client *clp); + +struct cb_devicenotifyitem { + uint32_t cbd_notify_type; + uint32_t cbd_layout_type; + struct nfs4_deviceid cbd_dev_id; + uint32_t cbd_immediate; +}; + +struct cb_devicenotifyargs { + int ndevs; + struct cb_devicenotifyitem *devs; +}; + +extern __be32 nfs4_callback_devicenotify( + struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps); + #endif /* CONFIG_NFS_V4_1 */ extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2f41dccea18e..d4d1954e9bb9 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || mark_matching_lsegs_invalid(lo, &free_me_list, - args->cbl_range.iomode)) + &args->cbl_range)) rv = NFS4ERR_DELAY; else rv = NFS4ERR_NOMATCHING_LAYOUT; @@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, ino = lo->plh_inode; spin_lock(&ino->i_lock); set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) + if (mark_matching_lsegs_invalid(lo, &free_me_list, &range)) rv = NFS4ERR_DELAY; list_del_init(&lo->plh_bulk_recall); spin_unlock(&ino->i_lock); @@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp) do_callback_layoutrecall(clp, &args); } +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps) +{ + int i; + __be32 res = 0; + struct nfs_client *clp = cps->clp; + struct nfs_server *server = NULL; + + dprintk("%s: -->\n", __func__); + + if (!clp) { + res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + goto out; + } + + for (i = 0; i < args->ndevs; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + if (!server || + server->pnfs_curr_ld->id != dev->cbd_layout_type) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (server->pnfs_curr_ld && + server->pnfs_curr_ld->id == dev->cbd_layout_type) { + rcu_read_unlock(); + goto found; + } + rcu_read_unlock(); + dprintk("%s: layout type %u not found\n", + __func__, dev->cbd_layout_type); + continue; + } + + found: + if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) + dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " + "deleting instead\n", __func__); + nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + } + +out: + kfree(args->devs); + dprintk("%s: exit with status = %u\n", + __func__, be32_to_cpu(res)); + return res; +} + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) { if (delegation == NULL) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 00ecf62ce7c1..c6c86a77e043 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -25,6 +25,7 @@ #if defined(CONFIG_NFS_V4_1) #define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 4 + 1 + 3) #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -284,6 +285,93 @@ out: return status; } +static +__be32 decode_devicenotify_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_devicenotifyargs *args) +{ + __be32 *p; + __be32 status = 0; + u32 tmp; + int n, i; + args->ndevs = 0; + + /* Num of device notifications */ + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + n = ntohl(*p++); + if (n <= 0) + goto out; + + args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); + if (!args->devs) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + /* Decode each dev notification */ + for (i = 0; i < n; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + + tmp = ntohl(*p++); /* bitmap size */ + if (tmp != 1) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_notify_type = ntohl(*p++); + if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && + dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + + tmp = ntohl(*p++); /* opaque size */ + if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && + (tmp != NFS4_DEVICEID4_SIZE + 8)) || + ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && + (tmp != NFS4_DEVICEID4_SIZE + 4))) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_layout_type = ntohl(*p++); + memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + dev->cbd_immediate = ntohl(*p++); + } else { + dev->cbd_immediate = 0; + } + + args->ndevs++; + + dprintk("%s: type %d layout 0x%x immediate %d\n", + __func__, dev->cbd_notify_type, dev->cbd_layout_type, + dev->cbd_immediate); + } +out: + dprintk("%s: status %d ndevs %d\n", + __func__, ntohl(status), args->ndevs); + return status; +err: + kfree(args->devs); + goto out; +} + static __be32 decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) { @@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_RECALL_ANY: case OP_CB_RECALL_SLOT: case OP_CB_LAYOUTRECALL: + case OP_CB_NOTIFY_DEVICEID: *op = &callback_ops[op_nr]; break; - case OP_CB_NOTIFY_DEVICEID: case OP_CB_NOTIFY: case OP_CB_PUSH_DELEG: case OP_CB_RECALLABLE_OBJ_AVAIL: @@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = { (callback_decode_arg_t)decode_layoutrecall_args, .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, }, + [OP_CB_NOTIFY_DEVICEID] = { + .process_op = (callback_process_op_t)nfs4_callback_devicenotify, + .decode_args = + (callback_decode_arg_t)decode_devicenotify_args, + .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, + }, [OP_CB_SEQUENCE] = { .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 139be9647d80..b3dc2b88b65b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp) if (clp->cl_machine_cred != NULL) put_rpccred(clp->cl_machine_cred); + nfs4_deviceid_purge_client(clp); + kfree(clp->cl_hostname); kfree(clp); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index bbbc6bf5cb2e..dd25c2aec375 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -21,25 +21,13 @@ #include "delegation.h" #include "internal.h" -static void nfs_do_free_delegation(struct nfs_delegation *delegation) -{ - kfree(delegation); -} - -static void nfs_free_delegation_callback(struct rcu_head *head) -{ - struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu); - - nfs_do_free_delegation(delegation); -} - static void nfs_free_delegation(struct nfs_delegation *delegation) { if (delegation->cred) { put_rpccred(delegation->cred); delegation->cred = NULL; } - call_rcu(&delegation->rcu, nfs_free_delegation_callback); + kfree_rcu(delegation, rcu); } /** diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 424e47773a84..ededdbd0db38 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct page **xdr_pages, struct page *page, unsigned int buflen) { struct xdr_stream stream; - struct xdr_buf buf = { - .pages = xdr_pages, - .page_len = buflen, - .buflen = buflen, - .len = buflen, - }; + struct xdr_buf buf; struct page *scratch; struct nfs_cache_array *array; unsigned int count = 0; @@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (scratch == NULL) return -ENOMEM; - xdr_init_decode(&stream, &buf, NULL); + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); do { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 57bb31ad7a5e..144f2a3c7185 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1298,8 +1298,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } - dprintk("NFS: isize change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); + dprintk("NFS: isize change on server for file %s/%ld " + "(%Ld to %Ld)\n", + inode->i_sb->s_id, + inode->i_ino, + (long long)cur_isize, + (long long)new_isize); } } else invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR @@ -1424,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ void nfs4_evict_inode(struct inode *inode) { - pnfs_destroy_layout(NFS_I(inode)); truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ nfs_inode_return_delegation_noreclaim(inode); /* First call standard NFS clear_inode() code */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2df6ca7b5898..b9056cbe68d6 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *, #endif /* nfs4proc.c */ +extern void __nfs4_read_done_cb(struct nfs_read_data *); extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); extern int nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index be79dc9f386d..426908809c97 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, struct nfs4_deviceid *id, gfp_t gfp_flags) { + struct nfs4_deviceid_node *d; struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); @@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); if (fl->pattern_offset > lgr->range.offset) { - dprintk("%s pattern_offset %lld to large\n", + dprintk("%s pattern_offset %lld too large\n", __func__, fl->pattern_offset); goto out; } @@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, } /* find and reference the deviceid */ - dsaddr = nfs4_fl_find_get_deviceid(id); - if (dsaddr == NULL) { + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, + NFS_SERVER(lo->plh_inode)->nfs_client, id); + if (d == NULL) { dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); if (dsaddr == NULL) goto out; - } + } else + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); fl->dsaddr = dsaddr; if (fl->first_stripe_index < 0 || @@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, gfp_t gfp_flags) { struct xdr_stream stream; - struct xdr_buf buf = { - .pages = lgr->layoutp->pages, - .page_len = lgr->layoutp->len, - .buflen = lgr->layoutp->len, - .len = lgr->layoutp->len, - }; + struct xdr_buf buf; struct page *scratch; __be32 *p; uint32_t nfl_util; @@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, if (!scratch) return -ENOMEM; - xdr_init_decode(&stream, &buf, NULL); + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), @@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, memcpy(id, p, sizeof(*id)); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - print_deviceid(id); + nfs4_print_deviceid(id); nfl_util = be32_to_cpup(p++); if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) @@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, /* * filelayout_pg_test(). Called by nfs_can_coalesce_requests() * - * return 1 : coalesce page - * return 0 : don't coalesce page + * return true : coalesce page + * return false : don't coalesce page */ -int +bool filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { u64 p_stripe, r_stripe; u32 stripe_unit; + if (!pnfs_generic_pg_test(pgio, prev, req)) + return 0; + if (!pgio->pg_lseg) return 1; p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; @@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, return -ENOMEM; } +static void +filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +{ + nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); +} + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", @@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, + .free_deviceid_node = filelayout_free_deveiceid_node, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 2b461d77b43a..cebe01e3795e 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -59,9 +59,7 @@ struct nfs4_pnfs_ds { #define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 struct nfs4_file_layout_dsaddr { - struct hlist_node node; - struct nfs4_deviceid deviceid; - atomic_t ref; + struct nfs4_deviceid_node id_node; unsigned long flags; u32 stripe_count; u8 *stripe_indices; @@ -95,14 +93,12 @@ extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); extern void print_ds(struct nfs4_pnfs_ds *ds); -extern void print_deviceid(struct nfs4_deviceid *dev_id); u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx); -extern struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id); extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index db07c7af1395..3b7bf1377264 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -37,30 +37,6 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD /* - * Device ID RCU cache. A device ID is unique per client ID and layout type. - */ -#define NFS4_FL_DEVICE_ID_HASH_BITS 5 -#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS) -#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1) - -static inline u32 -nfs4_fl_deviceid_hash(struct nfs4_deviceid *id) -{ - unsigned char *cptr = (unsigned char *)id->data; - unsigned int nbytes = NFS4_DEVICEID4_SIZE; - u32 x = 0; - - while (nbytes--) { - x *= 37; - x += *cptr++; - } - return x & NFS4_FL_DEVICE_ID_HASH_MASK; -} - -static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE]; -static DEFINE_SPINLOCK(filelayout_deviceid_lock); - -/* * Data server cache * * Data servers can be mapped to different device ids. @@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds) ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); } -void -print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) -{ - int i; - - ifdebug(FACILITY) { - printk("%s dsaddr->ds_num %d\n", __func__, - dsaddr->ds_num); - for (i = 0; i < dsaddr->ds_num; i++) - print_ds(dsaddr->ds_list[i]); - } -} - -void print_deviceid(struct nfs4_deviceid *id) -{ - u32 *p = (u32 *)id; - - dprintk("%s: device id= [%x%x%x%x]\n", __func__, - p[0], p[1], p[2], p[3]); -} - /* nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * _data_server_lookup_locked(u32 ip_addr, u32 port) @@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds) kfree(ds); } -static void +void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) { struct nfs4_pnfs_ds *ds; int i; - print_deviceid(&dsaddr->deviceid); + nfs4_print_deviceid(&dsaddr->id_node.deviceid); for (i = 0; i < dsaddr->ds_num; i++) { ds = dsaddr->ds_list[i]; @@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) u8 max_stripe_index; struct nfs4_file_layout_dsaddr *dsaddr = NULL; struct xdr_stream stream; - struct xdr_buf buf = { - .pages = pdev->pages, - .page_len = pdev->pglen, - .buflen = pdev->pglen, - .len = pdev->pglen, - }; + struct xdr_buf buf; struct page *scratch; /* set up xdr stream */ @@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) if (!scratch) goto out_err; - xdr_init_decode(&stream, &buf, NULL); + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); /* Get the stripe count (number of stripe index) */ @@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) dsaddr->stripe_indices = stripe_indices; stripe_indices = NULL; dsaddr->ds_num = num; - - memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); + nfs4_init_deviceid_node(&dsaddr->id_node, + NFS_SERVER(ino)->pnfs_curr_ld, + NFS_SERVER(ino)->nfs_client, + &pdev->dev_id); for (i = 0; i < dsaddr->ds_num; i++) { int j; @@ -505,8 +457,8 @@ out_err: static struct nfs4_file_layout_dsaddr * decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) { - struct nfs4_file_layout_dsaddr *d, *new; - long hash; + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *n, *new; new = decode_device(inode, dev, gfp_flags); if (!new) { @@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl return NULL; } - spin_lock(&filelayout_deviceid_lock); - d = nfs4_fl_find_get_deviceid(&new->deviceid); - if (d) { - spin_unlock(&filelayout_deviceid_lock); + d = nfs4_insert_deviceid_node(&new->id_node); + n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + if (n != new) { nfs4_fl_free_deviceid(new); - return d; + return n; } - INIT_HLIST_NODE(&new->node); - atomic_set(&new->ref, 1); - hash = nfs4_fl_deviceid_hash(&new->deviceid); - hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]); - spin_unlock(&filelayout_deviceid_lock); - return new; } @@ -600,35 +545,7 @@ out_free: void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) { - if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) { - hlist_del_rcu(&dsaddr->node); - spin_unlock(&filelayout_deviceid_lock); - - synchronize_rcu(); - nfs4_fl_free_deviceid(dsaddr); - } -} - -struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id) -{ - struct nfs4_file_layout_dsaddr *d; - struct hlist_node *n; - long hash = nfs4_fl_deviceid_hash(id); - - - rcu_read_lock(); - hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) { - if (!memcmp(&d->deviceid, id, sizeof(*id))) { - if (!atomic_inc_not_zero(&d->ref)) - goto fail; - rcu_read_unlock(); - return d; - } - } -fail: - rcu_read_unlock(); - return NULL; + nfs4_put_deviceid_node(&dsaddr->id_node); } /* @@ -676,15 +593,15 @@ static void filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, int err, u32 ds_addr) { - u32 *p = (u32 *)&dsaddr->deviceid; + u32 *p = (u32 *)&dsaddr->id_node.deviceid; printk(KERN_ERR "NFS: data server %x connection error %d." " Deviceid [%x%x%x%x] marked out of use.\n", ds_addr, err, p[0], p[1], p[2], p[3]); - spin_lock(&filelayout_deviceid_lock); + spin_lock(&nfs4_ds_cache_lock); dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; - spin_unlock(&filelayout_deviceid_lock); + spin_unlock(&nfs4_ds_cache_lock); } struct nfs4_pnfs_ds * diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf1b339c3937..d2c4b59c896d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -267,9 +267,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc break; nfs4_schedule_stateid_recovery(server, state); goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) + nfs4_schedule_stateid_recovery(server, state); case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(clp); goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) @@ -2361,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct nfs4_state *state = NULL; int status; + if (pnfs_ld_layoutret_on_setattr(inode)) + pnfs_return_layout(inode); + nfs_fattr_init(fattr); /* Search for an existing open(O_WRITE) file */ @@ -3175,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +void __nfs4_read_done_cb(struct nfs_read_data *data) +{ + nfs_invalidate_atime(data->inode); +} + static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_server *server = NFS_SERVER(data->inode); @@ -3184,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) return -EAGAIN; } - nfs_invalidate_atime(data->inode); + __nfs4_read_done_cb(data); if (task->tk_status > 0) renew_lease(server, data->timestamp); return 0; @@ -3198,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; - return data->read_done_cb(task, data); + return data->read_done_cb ? data->read_done_cb(task, data) : + nfs4_read_done_cb(task, data); } static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) @@ -3243,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; - return data->write_done_cb(task, data); + return data->write_done_cb ? data->write_done_cb(task, data) : + nfs4_write_done_cb(task, data); } /* Reset the the nfs_write_data to send the write to the MDS. */ @@ -3670,9 +3682,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, break; nfs4_schedule_stateid_recovery(server, state); goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) + nfs4_schedule_stateid_recovery(server, state); case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(clp); goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) @@ -4543,6 +4557,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -ESTALE: goto out; case -NFS4ERR_EXPIRED: + nfs4_schedule_stateid_recovery(server, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: nfs4_schedule_lease_recovery(server->nfs_client); @@ -5666,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) return status; } +static void +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + + dprintk("--> %s\n", __func__); + if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, + &lrp->res.seq_res, 0, task)) + return; + rpc_call_start(task); +} + +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + struct nfs_server *server; + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &lrp->res.seq_res)) + return; + + server = NFS_SERVER(lrp->args.inode); + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + nfs_restart_rpc(task, lrp->clp); + return; + } + if (task->tk_status == 0) { + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; + + if (lrp->res.lrs_present) { + spin_lock(&lo->plh_inode->i_lock); + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + spin_unlock(&lo->plh_inode->i_lock); + } else + BUG_ON(!list_empty(&lo->plh_segs)); + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutreturn_release(void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + + dprintk("--> %s\n", __func__); + put_layout_hdr(NFS_I(lrp->args.inode)->layout); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { + .rpc_call_prepare = nfs4_layoutreturn_prepare, + .rpc_call_done = nfs4_layoutreturn_done, + .rpc_release = nfs4_layoutreturn_release, +}; + +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], + .rpc_argp = &lrp->args, + .rpc_resp = &lrp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = lrp->clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutreturn_call_ops, + .callback_data = lrp, + }; + int status; + + dprintk("--> %s\n", __func__); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + dprintk("<-- %s status=%d\n", __func__, status); + rpc_put_task(task); + return status; +} + static int _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 036f5adc9e1f..e97dd219f84f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1466,7 +1466,10 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) #ifdef CONFIG_NFS_V4_1 void nfs4_schedule_session_recovery(struct nfs4_session *session) { - nfs4_schedule_lease_recovery(session->clp); + struct nfs_client *clp = session->clp; + + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + nfs4_schedule_lease_recovery(clp); } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); @@ -1549,6 +1552,7 @@ static int nfs4_reset_session(struct nfs_client *clp) status = nfs4_recovery_handle_error(clp, status); goto out; } + clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* create_session negotiated new slot table */ clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c3ccd2c46834..d869a5e5464b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int); 1 /* layoutupdate4 layout type */ + \ 1 /* NULL filelayout layoutupdate4 payload */) #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) - +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + 1 /* FIXME: opaque lrf_body always empty at the moment */) +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ + 1 + decode_stateid_maxsz) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int); decode_putfh_maxsz + \ decode_layoutcommit_maxsz + \ decode_getattr_maxsz) - +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutreturn_maxsz) +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr, static int encode_layoutcommit(struct xdr_stream *xdr, + struct inode *inode, const struct nfs4_layoutcommit_args *args, struct compound_hdr *hdr) { @@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr, dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, NFS_SERVER(args->inode)->pnfs_curr_ld->id); - p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE); + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); /* Only whole file layouts */ p = xdr_encode_hyper(p, 0); /* offset */ @@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->lastbytewritten); *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ - *p++ = cpu_to_be32(0); /* no file layout payload */ + + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) + NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( + NFS_I(inode)->layout, xdr, args); + else { + p = reserve_space(xdr, 4); + *p = cpu_to_be32(0); /* no layout-type payload */ + } hdr->nops++; hdr->replen += decode_layoutcommit_maxsz; return 0; } + +static void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(OP_LAYOUTRETURN); + *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ + *p++ = cpu_to_be32(args->layout_type); + *p++ = cpu_to_be32(IOMODE_ANY); + *p = cpu_to_be32(RETURN_FILE); + p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, 0); + p = xdr_encode_hyper(p, NFS4_MAX_UINT64); + spin_lock(&args->inode->i_lock); + xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); + spin_unlock(&args->inode->i_lock); + if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { + NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( + NFS_I(args->inode)->layout, xdr, args); + } else { + p = reserve_space(xdr, 4); + *p = cpu_to_be32(0); + } + hdr->nops++; + hdr->replen += decode_layoutreturn_maxsz; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, /* * Encode LAYOUTCOMMIT request */ -static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, - struct xdr_stream *xdr, - struct nfs4_layoutcommit_args *args) +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_args *args) { + struct nfs4_layoutcommit_data *data = + container_of(args, struct nfs4_layoutcommit_data, args); struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; @@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, NFS_FH(args->inode), &hdr); - encode_layoutcommit(xdr, args, &hdr); + encode_layoutcommit(xdr, data->args.inode, args, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); - return 0; +} + +/* + * Encode LAYOUTRETURN request + */ +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutreturn(xdr, args, &hdr); + encode_nops(&hdr); } #endif /* CONFIG_NFS_V4_1 */ @@ -5203,6 +5271,27 @@ out_overflow: return -EIO; } +static int decode_layoutreturn(struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTRETURN); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->lrs_present = be32_to_cpup(p); + if (res->lrs_present) + status = decode_stateid(xdr, &res->stateid); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_layoutcommit(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_layoutcommit_res *res) @@ -6320,6 +6409,30 @@ out: } /* + * Decode LAYOUTRETURN response + */ +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutreturn(xdr, res); +out: + return status; +} + +/* * Decode LAYOUTCOMMIT response */ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, @@ -6547,6 +6660,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), PROC(LAYOUTGET, enc_layoutget, dec_layoutget), PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index c541093a5bf2..c4744e1d513c 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -87,7 +87,7 @@ #define NFS_ROOT "/tftpboot/%s" /* Default NFSROOT mount options. */ -#define NFS_DEF_OPTIONS "udp" +#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" /* Parameters passed from the kernel command line */ static char nfs_root_parms[256] __initdata = ""; diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild new file mode 100644 index 000000000000..ed30ea072bb8 --- /dev/null +++ b/fs/nfs/objlayout/Kbuild @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module +# +objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c new file mode 100644 index 000000000000..9cf208df1f25 --- /dev/null +++ b/fs/nfs/objlayout/objio_osd.c @@ -0,0 +1,1057 @@ +/* + * pNFS Objects layout implementation over open-osd initiator library + * + * Copyright (C) 2009 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy <bhalevy@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <scsi/osd_initiator.h> + +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +#define _LLU(x) ((unsigned long long)x) + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), +}; + +struct objio_dev_ent { + struct nfs4_deviceid_node id_node; + struct osd_dev *od; +}; + +static void +objio_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); + + dprintk("%s: free od=%p\n", __func__, de->od); + osduld_put_device(de->od); + kfree(de); +} + +static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de; + + d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); + if (!d) + return NULL; + + de = container_of(d, struct objio_dev_ent, id_node); + return de; +} + +static struct objio_dev_ent * +_dev_list_add(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id, struct osd_dev *od, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); + struct objio_dev_ent *n; + + if (!de) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + return NULL; + } + + dprintk("%s: Adding od=%p\n", __func__, od); + nfs4_init_deviceid_node(&de->id_node, + nfss->pnfs_curr_ld, + nfss->nfs_client, + d_id); + de->od = od; + + d = nfs4_insert_deviceid_node(&de->id_node); + n = container_of(d, struct objio_dev_ent, id_node); + if (n != de) { + dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + objio_free_deviceid_node(&de->id_node); + de = n; + } + + atomic_inc(&de->id_node.ref); + return de; +} + +struct caps_buffers { + u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; + u8 creds[OSD_CAP_LEN]; +}; + +struct objio_segment { + struct pnfs_layout_segment lseg; + + struct pnfs_osd_object_cred *comps; + + unsigned mirrors_p1; + unsigned stripe_unit; + unsigned group_width; /* Data stripe_units without integrity comps */ + u64 group_depth; + unsigned group_count; + + unsigned max_io_size; + + unsigned comps_index; + unsigned num_comps; + /* variable length */ + struct objio_dev_ent *ods[]; +}; + +static inline struct objio_segment * +OBJIO_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, struct objio_segment, lseg); +} + +struct objio_state; +typedef ssize_t (*objio_done_fn)(struct objio_state *ios); + +struct objio_state { + /* Generic layer */ + struct objlayout_io_state ol_state; + + struct objio_segment *layout; + + struct kref kref; + objio_done_fn done; + void *private; + + unsigned long length; + unsigned numdevs; /* Actually used devs in this IO */ + /* A per-device variable array of size numdevs */ + struct _objio_per_comp { + struct bio *bio; + struct osd_request *or; + unsigned long length; + u64 offset; + unsigned dev; + } per_dev[]; +}; + +/* Send and wait for a get_device_info of devices in the layout, + then look them up with the osd_initiator library */ +static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned comp, + gfp_t gfp_flags) +{ + struct pnfs_osd_deviceaddr *deviceaddr; + struct nfs4_deviceid *d_id; + struct objio_dev_ent *ode; + struct osd_dev *od; + struct osd_dev_info odi; + int err; + + d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; + + ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); + if (ode) + return ode; + + err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); + if (unlikely(err)) { + dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", + __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); + return ERR_PTR(err); + } + + odi.systemid_len = deviceaddr->oda_systemid.len; + if (odi.systemid_len > sizeof(odi.systemid)) { + err = -EINVAL; + goto out; + } else if (odi.systemid_len) + memcpy(odi.systemid, deviceaddr->oda_systemid.data, + odi.systemid_len); + odi.osdname_len = deviceaddr->oda_osdname.len; + odi.osdname = (u8 *)deviceaddr->oda_osdname.data; + + if (!odi.osdname_len && !odi.systemid_len) { + dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", + __func__); + err = -ENODEV; + goto out; + } + + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + err = PTR_ERR(od); + dprintk("%s: osduld_info_lookup => %d\n", __func__, err); + goto out; + } + + ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, + gfp_flags); + +out: + dprintk("%s: return=%d\n", __func__, err); + objlayout_put_deviceinfo(deviceaddr); + return err ? ERR_PTR(err) : ode; +} + +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, + gfp_t gfp_flags) +{ + unsigned i; + int err; + + /* lookup all devices */ + for (i = 0; i < objio_seg->num_comps; i++) { + struct objio_dev_ent *ode; + + ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); + if (unlikely(IS_ERR(ode))) { + err = PTR_ERR(ode); + goto out; + } + objio_seg->ods[i] = ode; + } + err = 0; + +out: + dprintk("%s: return=%d\n", __func__, err); + return err; +} + +static int _verify_data_map(struct pnfs_osd_layout *layout) +{ + struct pnfs_osd_data_map *data_map = &layout->olo_map; + u64 stripe_length; + u32 group_width; + +/* FIXME: Only raid0 for now. if not go through MDS */ + if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { + printk(KERN_ERR "Only RAID_0 for now\n"); + return -ENOTSUPP; + } + if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { + printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", + data_map->odm_num_comps, data_map->odm_mirror_cnt); + return -EINVAL; + } + + if (data_map->odm_group_width) + group_width = data_map->odm_group_width; + else + group_width = data_map->odm_num_comps / + (data_map->odm_mirror_cnt + 1); + + stripe_length = (u64)data_map->odm_stripe_unit * group_width; + if (stripe_length >= (1ULL << 32)) { + printk(KERN_ERR "Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -ENOTSUPP; + } + + if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { + printk(KERN_ERR "Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(data_map->odm_stripe_unit), PAGE_SIZE); + return -ENOTSUPP; + } + + return 0; +} + +static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, + struct pnfs_osd_object_cred *src_comp, + struct caps_buffers *caps_p) +{ + WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); + WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); + + *cur_comp = *src_comp; + + memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, + sizeof(caps_p->caps_key)); + cur_comp->oc_cap_key.cred = caps_p->caps_key; + + memcpy(caps_p->creds, src_comp->oc_cap.cred, + sizeof(caps_p->creds)); + cur_comp->oc_cap.cred = caps_p->creds; +} + +int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg; + struct pnfs_osd_xdr_decode_layout_iter iter; + struct pnfs_osd_layout layout; + struct pnfs_osd_object_cred *cur_comp, src_comp; + struct caps_buffers *caps_p; + int err; + + err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); + if (unlikely(err)) + return err; + + err = _verify_data_map(&layout); + if (unlikely(err)) + return err; + + objio_seg = kzalloc(sizeof(*objio_seg) + + sizeof(objio_seg->ods[0]) * layout.olo_num_comps + + sizeof(*objio_seg->comps) * layout.olo_num_comps + + sizeof(struct caps_buffers) * layout.olo_num_comps, + gfp_flags); + if (!objio_seg) + return -ENOMEM; + + objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); + cur_comp = objio_seg->comps; + caps_p = (void *)(cur_comp + layout.olo_num_comps); + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) + copy_single_comp(cur_comp++, &src_comp, caps_p++); + if (unlikely(err)) + goto err; + + objio_seg->num_comps = layout.olo_num_comps; + objio_seg->comps_index = layout.olo_comps_index; + err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); + if (err) + goto err; + + objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; + if (layout.olo_map.odm_group_width) { + objio_seg->group_width = layout.olo_map.odm_group_width; + objio_seg->group_depth = layout.olo_map.odm_group_depth; + objio_seg->group_count = layout.olo_map.odm_num_comps / + objio_seg->mirrors_p1 / + objio_seg->group_width; + } else { + objio_seg->group_width = layout.olo_map.odm_num_comps / + objio_seg->mirrors_p1; + objio_seg->group_depth = -1; + objio_seg->group_count = 1; + } + + /* Cache this calculation it will hit for every page */ + objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - + objio_seg->stripe_unit) * + objio_seg->group_width; + + *outp = &objio_seg->lseg; + return 0; + +err: + kfree(objio_seg); + dprintk("%s: Error: return %d\n", __func__, err); + *outp = NULL; + return err; +} + +void objio_free_lseg(struct pnfs_layout_segment *lseg) +{ + int i; + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + + for (i = 0; i < objio_seg->num_comps; i++) { + if (!objio_seg->ods[i]) + break; + nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + } + kfree(objio_seg); +} + +int objio_alloc_io_state(struct pnfs_layout_segment *lseg, + struct objlayout_io_state **outp, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + struct objio_state *ios; + const unsigned first_size = sizeof(*ios) + + objio_seg->num_comps * sizeof(ios->per_dev[0]); + const unsigned sec_size = objio_seg->num_comps * + sizeof(ios->ol_state.ioerrs[0]); + + ios = kzalloc(first_size + sec_size, gfp_flags); + if (unlikely(!ios)) + return -ENOMEM; + + ios->layout = objio_seg; + ios->ol_state.ioerrs = ((void *)ios) + first_size; + ios->ol_state.num_comps = objio_seg->num_comps; + + *outp = &ios->ol_state; + return 0; +} + +void objio_free_io_state(struct objlayout_io_state *ol_state) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + + kfree(ios); +} + +enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) +{ + switch (oep) { + case OSD_ERR_PRI_NO_ERROR: + return (enum pnfs_osd_errno)0; + + case OSD_ERR_PRI_CLEAR_PAGES: + BUG_ON(1); + return 0; + + case OSD_ERR_PRI_RESOURCE: + return PNFS_OSD_ERR_RESOURCE; + case OSD_ERR_PRI_BAD_CRED: + return PNFS_OSD_ERR_BAD_CRED; + case OSD_ERR_PRI_NO_ACCESS: + return PNFS_OSD_ERR_NO_ACCESS; + case OSD_ERR_PRI_UNREACHABLE: + return PNFS_OSD_ERR_UNREACHABLE; + case OSD_ERR_PRI_NOT_FOUND: + return PNFS_OSD_ERR_NOT_FOUND; + case OSD_ERR_PRI_NO_SPACE: + return PNFS_OSD_ERR_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case OSD_ERR_PRI_EIO: + return PNFS_OSD_ERR_EIO; + } +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +static int _io_check(struct objio_state *ios, bool is_write) +{ + enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; + int lin_ret = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct osd_request *or = ios->per_dev[i].or; + unsigned dev; + int ret; + + if (!or) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + BUG_ON(is_write); + _clear_bio(ios->per_dev[i].bio); + dprintk("%s: start read offset passed end of file " + "offset=0x%llx, length=0x%lx\n", __func__, + _LLU(ios->per_dev[i].offset), + ios->per_dev[i].length); + + continue; /* we recovered */ + } + dev = ios->per_dev[i].dev; + objlayout_io_set_result(&ios->ol_state, dev, + &ios->layout->comps[dev].oc_object_id, + osd_pri_2_pnfs_err(osi.osd_err_pri), + ios->per_dev[i].offset, + ios->per_dev[i].length, + is_write); + + if (osi.osd_err_pri >= oep) { + oep = osi.osd_err_pri; + lin_ret = ret; + } + } + + return lin_ret; +} + +/* + * Common IO state helpers. + */ +static void _io_free(struct objio_state *ios) +{ + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct _objio_per_comp *per_dev = &ios->per_dev[i]; + + if (per_dev->or) { + osd_end_request(per_dev->or); + per_dev->or = NULL; + } + + if (per_dev->bio) { + bio_put(per_dev->bio); + per_dev->bio = NULL; + } + } +} + +struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) +{ + unsigned min_dev = ios->layout->comps_index; + unsigned max_dev = min_dev + ios->layout->num_comps; + + BUG_ON(dev < min_dev || max_dev <= dev); + return ios->layout->ods[dev - min_dev]->od; +} + +struct _striping_info { + u64 obj_offset; + u64 group_length; + unsigned dev; + unsigned unit_off; +}; + +static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, + struct _striping_info *si) +{ + u32 stripe_unit = ios->layout->stripe_unit; + u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + u32 U = stripe_unit * group_width; + + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodU = file_offset - M * S; + u32 G = div64_u64(LmodU, T); + u64 H = LmodU - G * T; + + u32 N = div_u64(H, U); + + div_u64_rem(file_offset, stripe_unit, &si->unit_off); + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; + si->dev *= ios->layout->mirrors_p1; + + si->group_length = T - H; +} + +static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, + unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, + gfp_t gfp_flags) +{ + unsigned pg = *cur_pg; + struct request_queue *q = + osd_request_queue(_io_od(ios, per_dev->dev)); + + per_dev->length += cur_len; + + if (per_dev->bio == NULL) { + unsigned stripes = ios->layout->num_comps / + ios->layout->mirrors_p1; + unsigned pages_in_stripe = stripes * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / + stripes; + + if (BIO_MAX_PAGES_KMALLOC < bio_size) + bio_size = BIO_MAX_PAGES_KMALLOC; + + per_dev->bio = bio_kmalloc(gfp_flags, bio_size); + if (unlikely(!per_dev->bio)) { + dprintk("Faild to allocate BIO size=%u\n", bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + BUG_ON(ios->ol_state.nr_pages <= pg); + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, + ios->ol_state.pages[pg], pglen, pgbase); + if (unlikely(pglen != added_len)) + return -ENOMEM; + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + *cur_pg = pg; + return 0; +} + +static int _prepare_one_group(struct objio_state *ios, u64 length, + struct _striping_info *si, unsigned *last_pg, + gfp_t gfp_flags) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = *last_pg; + int ret = 0; + + while (length) { + struct _objio_per_comp *per_dev = &ios->per_dev[dev]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length) { + per_dev->dev = dev; + if (dev < si->dev) { + per_dev->offset = si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } else if (dev == si->dev) { + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && + (page_off != ios->ol_state.pgbase)); + } else { /* dev > si->dev */ + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } + + if (max_comp < dev) + max_comp = dev; + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len, gfp_flags); + if (unlikely(ret)) + goto out; + + dev += mirrors_p1; + dev = (dev % devs_in_group) + first_dev; + + length -= cur_len; + ios->length += cur_len; + } +out: + ios->numdevs = max_comp + mirrors_p1; + *last_pg = cur_pg; + return ret; +} + +static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) +{ + u64 length = ios->ol_state.count; + u64 offset = ios->ol_state.offset; + struct _striping_info si; + unsigned last_pg = 0; + int ret = 0; + + while (length) { + _calc_stripe_info(ios, offset, &si); + + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); + if (unlikely(ret)) + goto out; + + offset += si.group_length; + length -= si.group_length; + } + +out: + if (!ios->length) + return ret; + + return 0; +} + +static ssize_t _sync_done(struct objio_state *ios) +{ + struct completion *waiting = ios->private; + + complete(waiting); + return 0; +} + +static void _last_io(struct kref *kref) +{ + struct objio_state *ios = container_of(kref, struct objio_state, kref); + + ios->done(ios); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct objio_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +static ssize_t _io_exec(struct objio_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + ssize_t status = 0; /* sync status */ + unsigned i; + objio_done_fn saved_done_fn = ios->done; + bool sync = ios->ol_state.sync; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + + if (!or) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + + if (sync) { + wait_for_completion(&wait); + status = saved_done_fn(ios); + } + + return status; +} + +/* + * read + */ +static ssize_t _read_done(struct objio_state *ios) +{ + ssize_t status; + int ret = _io_check(ios, false); + + _io_free(ios); + + if (likely(!ret)) + status = ios->length; + else + status = ret; + + objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); + return status; +} + +static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) +{ + struct osd_request *or = NULL; + struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; + unsigned dev = per_dev->dev; + struct pnfs_osd_object_cred *cred = + &ios->layout->comps[dev]; + struct osd_obj_id obj = { + .partition = cred->oc_object_id.oid_partition_id, + .id = cred->oc_object_id.oid_object_id, + }; + int ret; + + or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + per_dev->or = or; + + osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); + + ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + if (ret) { + dprintk("%s: Faild to osd_finalize_request() => %d\n", + __func__, ret); + goto err; + } + + dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", + __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), + per_dev->length); + +err: + return ret; +} + +static ssize_t _read_exec(struct objio_state *ios) +{ + unsigned i; + int ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + if (!ios->per_dev[i].length) + continue; + ret = _read_mirrors(ios, i); + if (unlikely(ret)) + goto err; + } + + ios->done = _read_done; + return _io_exec(ios); /* In sync mode exec returns the io status */ + +err: + _io_free(ios); + return ret; +} + +ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + int ret; + + ret = _io_rw_pagelist(ios, GFP_KERNEL); + if (unlikely(ret)) + return ret; + + return _read_exec(ios); +} + +/* + * write + */ +static ssize_t _write_done(struct objio_state *ios) +{ + ssize_t status; + int ret = _io_check(ios, true); + + _io_free(ios); + + if (likely(!ret)) { + /* FIXME: should be based on the OSD's persistence model + * See OSD2r05 Section 4.13 Data persistence model */ + ios->ol_state.committed = NFS_FILE_SYNC; + status = ios->length; + } else { + status = ret; + } + + objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); + return status; +} + +static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) +{ + struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret; + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct osd_request *or = NULL; + struct pnfs_osd_object_cred *cred = + &ios->layout->comps[dev]; + struct osd_obj_id obj = { + .partition = cred->oc_object_id.oid_partition_id, + .id = cred->oc_object_id.oid_object_id, + }; + struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; + struct bio *bio; + + or = osd_start_request(_io_od(ios, dev), GFP_NOFS); + if (unlikely(!or)) { + ret = -ENOMEM; + goto err; + } + per_dev->or = or; + + if (per_dev != master_dev) { + bio = bio_kmalloc(GFP_NOFS, + master_dev->bio->bi_max_vecs); + if (unlikely(!bio)) { + dprintk("Faild to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto err; + } + + __bio_clone(bio, master_dev->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->bio = bio; + per_dev->dev = dev; + per_dev->length = master_dev->length; + per_dev->offset = master_dev->offset; + } else { + bio = master_dev->bio; + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); + + ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + if (ret) { + dprintk("%s: Faild to osd_finalize_request() => %d\n", + __func__, ret); + goto err; + } + + dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", + __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), + per_dev->length); + } + +err: + return ret; +} + +static ssize_t _write_exec(struct objio_state *ios) +{ + unsigned i; + int ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + if (!ios->per_dev[i].length) + continue; + ret = _write_mirrors(ios, i); + if (unlikely(ret)) + goto err; + } + + ios->done = _write_done; + return _io_exec(ios); /* In sync mode exec returns the io->status */ + +err: + _io_free(ios); + return ret; +} + +ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +{ + struct objio_state *ios = container_of(ol_state, struct objio_state, + ol_state); + int ret; + + /* TODO: ios->stable = stable; */ + ret = _io_rw_pagelist(ios, GFP_NOFS); + if (unlikely(ret)) + return ret; + + return _write_exec(ios); +} + +static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) +{ + if (!pnfs_generic_pg_test(pgio, prev, req)) + return false; + + return pgio->pg_count + req->wb_bytes <= + OBJIO_LSEG(pgio->pg_lseg)->max_io_size; +} + +static struct pnfs_layoutdriver_type objlayout_type = { + .id = LAYOUT_OSD2_OBJECTS, + .name = "LAYOUT_OSD2_OBJECTS", + .flags = PNFS_LAYOUTRET_ON_SETATTR, + + .alloc_layout_hdr = objlayout_alloc_layout_hdr, + .free_layout_hdr = objlayout_free_layout_hdr, + + .alloc_lseg = objlayout_alloc_lseg, + .free_lseg = objlayout_free_lseg, + + .read_pagelist = objlayout_read_pagelist, + .write_pagelist = objlayout_write_pagelist, + .pg_test = objio_pg_test, + + .free_deviceid_node = objio_free_deviceid_node, + + .encode_layoutcommit = objlayout_encode_layoutcommit, + .encode_layoutreturn = objlayout_encode_layoutreturn, +}; + +MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); +MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>"); +MODULE_LICENSE("GPL"); + +static int __init +objlayout_init(void) +{ + int ret = pnfs_register_layoutdriver(&objlayout_type); + + if (ret) + printk(KERN_INFO + "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", + __func__, ret); + else + printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", + __func__); + return ret; +} + +static void __exit +objlayout_exit(void) +{ + pnfs_unregister_layoutdriver(&objlayout_type); + printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", + __func__); +} + +module_init(objlayout_init); +module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c new file mode 100644 index 000000000000..dc3956c0de80 --- /dev/null +++ b/fs/nfs/objlayout/objlayout.c @@ -0,0 +1,712 @@ +/* + * pNFS Objects layout driver high level definitions + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy <bhalevy@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <scsi/osd_initiator.h> +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* + * Create a objlayout layout structure for the given inode and return it. + */ +struct pnfs_layout_hdr * +objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct objlayout *objlay; + + objlay = kzalloc(sizeof(struct objlayout), gfp_flags); + if (objlay) { + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); + } + dprintk("%s: Return %p\n", __func__, objlay); + return &objlay->pnfs_layout; +} + +/* + * Free an objlayout layout structure + */ +void +objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct objlayout *objlay = OBJLAYOUT(lo); + + dprintk("%s: objlay %p\n", __func__, objlay); + + WARN_ON(!list_empty(&objlay->err_list)); + kfree(objlay); +} + +/* + * Unmarshall layout and store it in pnfslay. + */ +struct pnfs_layout_segment * +objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + int status = -ENOMEM; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + struct pnfs_layout_segment *lseg; + + dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); + + scratch = alloc_page(gfp_flags); + if (!scratch) + goto err_nofree; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); + if (unlikely(status)) { + dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, + status); + goto err; + } + + __free_page(scratch); + + dprintk("%s: Return %p\n", __func__, lseg); + return lseg; + +err: + __free_page(scratch); +err_nofree: + dprintk("%s: Err Return=>%d\n", __func__, status); + return ERR_PTR(status); +} + +/* + * Free a layout segement + */ +void +objlayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s: freeing layout segment %p\n", __func__, lseg); + + if (unlikely(!lseg)) + return; + + objio_free_lseg(lseg); +} + +/* + * I/O Operations + */ +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1 : NFS4_MAX_UINT64; +} + +static struct objlayout_io_state * +objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, + struct page **pages, + unsigned pgbase, + loff_t offset, + size_t count, + struct pnfs_layout_segment *lseg, + void *rpcdata, + gfp_t gfp_flags) +{ + struct objlayout_io_state *state; + u64 lseg_end_offset; + + dprintk("%s: allocating io_state\n", __func__); + if (objio_alloc_io_state(lseg, &state, gfp_flags)) + return NULL; + + BUG_ON(offset < lseg->pls_range.offset); + lseg_end_offset = end_offset(lseg->pls_range.offset, + lseg->pls_range.length); + BUG_ON(offset >= lseg_end_offset); + if (offset + count > lseg_end_offset) { + count = lseg->pls_range.length - + (offset - lseg->pls_range.offset); + dprintk("%s: truncated count %Zd\n", __func__, count); + } + + if (pgbase > PAGE_SIZE) { + pages += pgbase >> PAGE_SHIFT; + pgbase &= ~PAGE_MASK; + } + + INIT_LIST_HEAD(&state->err_list); + state->lseg = lseg; + state->rpcdata = rpcdata; + state->pages = pages; + state->pgbase = pgbase; + state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; + state->offset = offset; + state->count = count; + state->sync = 0; + + return state; +} + +static void +objlayout_free_io_state(struct objlayout_io_state *state) +{ + dprintk("%s: freeing io_state\n", __func__); + if (unlikely(!state)) + return; + + objio_free_io_state(state); +} + +/* + * I/O done common code + */ +static void +objlayout_iodone(struct objlayout_io_state *state) +{ + dprintk("%s: state %p status\n", __func__, state); + + if (likely(state->status >= 0)) { + objlayout_free_io_state(state); + } else { + struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + + spin_lock(&objlay->lock); + objlay->delta_space_valid = OBJ_DSU_INVALID; + list_add(&objlay->err_list, &state->err_list); + spin_unlock(&objlay->lock); + } +} + +/* + * objlayout_io_set_result - Set an osd_error code on a specific osd comp. + * + * The @index component IO failed (error returned from target). Register + * the error for later reporting at layout-return. + */ +void +objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, + struct pnfs_osd_objid *pooid, int osd_error, + u64 offset, u64 length, bool is_write) +{ + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + + BUG_ON(index >= state->num_comps); + if (osd_error) { + ioerr->oer_component = *pooid; + ioerr->oer_comp_offset = offset; + ioerr->oer_comp_length = length; + ioerr->oer_iswrite = is_write; + ioerr->oer_errno = osd_error; + + dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " + "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, index, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + } else { + /* User need not call if no error is reported */ + ioerr->oer_errno = 0; + } +} + +/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_read_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + + pnfs_ld_read_done(rdata); +} + +void +objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +{ + int eof = state->eof; + struct nfs_read_data *rdata; + + state->status = status; + dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); + rdata = state->rpcdata; + rdata->task.tk_status = status; + if (status >= 0) { + rdata->res.count = status; + rdata->res.eof = eof; + } + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_ld_read_done(rdata); + else { + INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); + schedule_work(&rdata->task.u.tk_work); + } +} + +/* + * Perform sync or async reads. + */ +enum pnfs_try_status +objlayout_read_pagelist(struct nfs_read_data *rdata) +{ + loff_t offset = rdata->args.offset; + size_t count = rdata->args.count; + struct objlayout_io_state *state; + ssize_t status = 0; + loff_t eof; + + dprintk("%s: Begin inode %p offset %llu count %d\n", + __func__, rdata->inode, offset, (int)count); + + eof = i_size_read(rdata->inode); + if (unlikely(offset + count > eof)) { + if (offset >= eof) { + status = 0; + rdata->res.count = 0; + rdata->res.eof = 1; + goto out; + } + count = eof - offset; + } + + state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, + rdata->args.pages, rdata->args.pgbase, + offset, count, + rdata->lseg, rdata, + GFP_KERNEL); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->eof = state->offset + state->count >= eof; + + status = objio_read_pagelist(state); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + rdata->pnfs_error = status; + return PNFS_ATTEMPTED; +} + +/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_write_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + + pnfs_ld_write_done(wdata); +} + +void +objlayout_write_done(struct objlayout_io_state *state, ssize_t status, + bool sync) +{ + struct nfs_write_data *wdata; + + dprintk("%s: Begin\n", __func__); + wdata = state->rpcdata; + state->status = status; + wdata->task.tk_status = status; + if (status >= 0) { + wdata->res.count = status; + wdata->verf.committed = state->committed; + dprintk("%s: Return status %d committed %d\n", + __func__, wdata->task.tk_status, + wdata->verf.committed); + } else + dprintk("%s: Return status %d\n", + __func__, wdata->task.tk_status); + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_ld_write_done(wdata); + else { + INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); + schedule_work(&wdata->task.u.tk_work); + } +} + +/* + * Perform sync or async writes. + */ +enum pnfs_try_status +objlayout_write_pagelist(struct nfs_write_data *wdata, + int how) +{ + struct objlayout_io_state *state; + ssize_t status; + + dprintk("%s: Begin inode %p offset %llu count %u\n", + __func__, wdata->inode, wdata->args.offset, wdata->args.count); + + state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, + wdata->args.pages, + wdata->args.pgbase, + wdata->args.offset, + wdata->args.count, + wdata->lseg, wdata, + GFP_NOFS); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->sync = how & FLUSH_SYNC; + + status = objio_write_pagelist(state, how & FLUSH_STABLE); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + wdata->pnfs_error = status; + return PNFS_ATTEMPTED; +} + +void +objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct pnfs_osd_layoutupdate lou; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + + spin_lock(&objlay->lock); + lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); + lou.dsu_delta = objlay->delta_space_used; + objlay->delta_space_used = 0; + objlay->delta_space_valid = OBJ_DSU_INIT; + lou.olu_ioerr_flag = !list_empty(&objlay->err_list); + spin_unlock(&objlay->lock); + + start = xdr_reserve_space(xdr, 4); + + BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + + dprintk("%s: Return delta_space_used %lld err %d\n", __func__, + lou.dsu_delta, lou.olu_ioerr_flag); +} + +static int +err_prio(u32 oer_errno) +{ + switch (oer_errno) { + case 0: + return 0; + + case PNFS_OSD_ERR_RESOURCE: + return OSD_ERR_PRI_RESOURCE; + case PNFS_OSD_ERR_BAD_CRED: + return OSD_ERR_PRI_BAD_CRED; + case PNFS_OSD_ERR_NO_ACCESS: + return OSD_ERR_PRI_NO_ACCESS; + case PNFS_OSD_ERR_UNREACHABLE: + return OSD_ERR_PRI_UNREACHABLE; + case PNFS_OSD_ERR_NOT_FOUND: + return OSD_ERR_PRI_NOT_FOUND; + case PNFS_OSD_ERR_NO_SPACE: + return OSD_ERR_PRI_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case PNFS_OSD_ERR_EIO: + return OSD_ERR_PRI_EIO; + } +} + +static void +merge_ioerr(struct pnfs_osd_ioerr *dest_err, + const struct pnfs_osd_ioerr *src_err) +{ + u64 dest_end, src_end; + + if (!dest_err->oer_errno) { + *dest_err = *src_err; + /* accumulated device must be blank */ + memset(&dest_err->oer_component.oid_device_id, 0, + sizeof(dest_err->oer_component.oid_device_id)); + + return; + } + + if (dest_err->oer_component.oid_partition_id != + src_err->oer_component.oid_partition_id) + dest_err->oer_component.oid_partition_id = 0; + + if (dest_err->oer_component.oid_object_id != + src_err->oer_component.oid_object_id) + dest_err->oer_component.oid_object_id = 0; + + if (dest_err->oer_comp_offset > src_err->oer_comp_offset) + dest_err->oer_comp_offset = src_err->oer_comp_offset; + + dest_end = end_offset(dest_err->oer_comp_offset, + dest_err->oer_comp_length); + src_end = end_offset(src_err->oer_comp_offset, + src_err->oer_comp_length); + if (dest_end < src_end) + dest_end = src_end; + + dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; + + if ((src_err->oer_iswrite == dest_err->oer_iswrite) && + (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { + dest_err->oer_errno = src_err->oer_errno; + } else if (src_err->oer_iswrite) { + dest_err->oer_iswrite = true; + dest_err->oer_errno = src_err->oer_errno; + } +} + +static void +encode_accumulated_error(struct objlayout *objlay, __be32 *p) +{ + struct objlayout_io_state *state, *tmp; + struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + unsigned i; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + merge_ioerr(&accumulated_err, ioerr); + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } + + pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); +} + +void +objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct objlayout_io_state *state, *tmp; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + spin_lock(&objlay->lock); + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + __be32 *last_xdr = NULL, *p; + unsigned i; + int res = 0; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + dprintk("%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + p = pnfs_osd_xdr_ioerr_reserve_space(xdr); + if (unlikely(!p)) { + res = -E2BIG; + break; /* accumulated_error */ + } + + last_xdr = p; + pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + } + + /* TODO: use xdr_write_pages */ + if (unlikely(res)) { + /* no space for even one error descriptor */ + BUG_ON(!last_xdr); + + /* we've encountered a situation with lots and lots of + * errors and no space to encode them all. Use the last + * available slot to report the union of all the + * remaining errors. + */ + encode_accumulated_error(objlay, last_xdr); + goto loop_done; + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } +loop_done: + spin_unlock(&objlay->lock); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + + +/* + * Get Device Info API for io engines + */ +struct objlayout_deviceinfo { + struct page *page; + struct pnfs_osd_deviceaddr da; /* This must be last */ +}; + +/* Initialize and call nfs_getdeviceinfo, then decode and return a + * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() + * should be called. + */ +int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags) +{ + struct objlayout_deviceinfo *odi; + struct pnfs_device pd; + struct super_block *sb; + struct page *page, **pages; + u32 *p; + int err; + + page = alloc_page(gfp_flags); + if (!page) + return -ENOMEM; + + pages = &page; + pd.pages = pages; + + memcpy(&pd.dev_id, d_id, sizeof(*d_id)); + pd.layout_type = LAYOUT_OSD2_OBJECTS; + pd.pages = &page; + pd.pgbase = 0; + pd.pglen = PAGE_SIZE; + pd.mincount = 0; + + sb = pnfslay->plh_inode->i_sb; + err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); + dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); + if (err) + goto err_out; + + p = page_address(page); + odi = kzalloc(sizeof(*odi), gfp_flags); + if (!odi) { + err = -ENOMEM; + goto err_out; + } + pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); + odi->page = page; + *deviceaddr = &odi->da; + return 0; + +err_out: + __free_page(page); + return err; +} + +void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) +{ + struct objlayout_deviceinfo *odi = container_of(deviceaddr, + struct objlayout_deviceinfo, + da); + + __free_page(odi->page); + kfree(odi); +} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h new file mode 100644 index 000000000000..a8244c8e042d --- /dev/null +++ b/fs/nfs/objlayout/objlayout.h @@ -0,0 +1,187 @@ +/* + * Data types and function declerations for interfacing with the + * pNFS standard object layout driver. + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy <bhalevy@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _OBJLAYOUT_H +#define _OBJLAYOUT_H + +#include <linux/nfs_fs.h> +#include <linux/pnfs_osd_xdr.h> +#include "../pnfs.h" + +/* + * per-inode layout + */ +struct objlayout { + struct pnfs_layout_hdr pnfs_layout; + + /* for layout_commit */ + enum osd_delta_space_valid_enum { + OBJ_DSU_INIT = 0, + OBJ_DSU_VALID, + OBJ_DSU_INVALID, + } delta_space_valid; + s64 delta_space_used; /* consumed by write ops */ + + /* for layout_return */ + spinlock_t lock; + struct list_head err_list; +}; + +static inline struct objlayout * +OBJLAYOUT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct objlayout, pnfs_layout); +} + +/* + * per-I/O operation state + * embedded in objects provider io_state data structure + */ +struct objlayout_io_state { + struct pnfs_layout_segment *lseg; + + struct page **pages; + unsigned pgbase; + unsigned nr_pages; + unsigned long count; + loff_t offset; + bool sync; + + void *rpcdata; + int status; /* res */ + int eof; /* res */ + int committed; /* res */ + + /* Error reporting (layout_return) */ + struct list_head err_list; + unsigned num_comps; + /* Pointer to array of error descriptors of size num_comps. + * It should contain as many entries as devices in the osd_layout + * that participate in the I/O. It is up to the io_engine to allocate + * needed space and set num_comps. + */ + struct pnfs_osd_ioerr *ioerrs; +}; + +/* + * Raid engine I/O API + */ +extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags); +extern void objio_free_lseg(struct pnfs_layout_segment *lseg); + +extern int objio_alloc_io_state( + struct pnfs_layout_segment *lseg, + struct objlayout_io_state **outp, + gfp_t gfp_flags); +extern void objio_free_io_state(struct objlayout_io_state *state); + +extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); +extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, + bool stable); + +/* + * callback API + */ +extern void objlayout_io_set_result(struct objlayout_io_state *state, + unsigned index, struct pnfs_osd_objid *pooid, + int osd_error, u64 offset, u64 length, bool is_write); + +static inline void +objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +{ + struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + + /* If one of the I/Os errored out and the delta_space_used was + * invalid we render the complete report as invalid. Protocol mandate + * the DSU be accurate or not reported. + */ + spin_lock(&objlay->lock); + if (objlay->delta_space_valid != OBJ_DSU_INVALID) { + objlay->delta_space_valid = OBJ_DSU_VALID; + objlay->delta_space_used += space_used; + } + spin_unlock(&objlay->lock); +} + +extern void objlayout_read_done(struct objlayout_io_state *state, + ssize_t status, bool sync); +extern void objlayout_write_done(struct objlayout_io_state *state, + ssize_t status, bool sync); + +extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags); +extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); + +/* + * exported generic objects function vectors + */ + +extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); +extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); + +extern struct pnfs_layout_segment *objlayout_alloc_lseg( + struct pnfs_layout_hdr *, + struct nfs4_layoutget_res *, + gfp_t gfp_flags); +extern void objlayout_free_lseg(struct pnfs_layout_segment *); + +extern enum pnfs_try_status objlayout_read_pagelist( + struct nfs_read_data *); + +extern enum pnfs_try_status objlayout_write_pagelist( + struct nfs_write_data *, + int how); + +extern void objlayout_encode_layoutcommit( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutcommit_args *); + +extern void objlayout_encode_layoutreturn( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutreturn_args *); + +#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c new file mode 100644 index 000000000000..16fc758e9123 --- /dev/null +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -0,0 +1,412 @@ +/* + * Object-Based pNFS Layout XDR layer + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy <bhalevy@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/pnfs_osd_xdr.h> + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * The following implementation is based on RFC5664 + */ + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static __be32 * +_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) +{ + p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, + sizeof(objid->oid_device_id.data)); + + p = xdr_decode_hyper(p, &objid->oid_partition_id); + p = xdr_decode_hyper(p, &objid->oid_object_id); + return p; +} +/* + * struct pnfs_osd_opaque_cred { + * u32 cred_len; + * void *cred; + * }; // xdr size [variable] + * The return pointers are from the xdr buffer + */ +static int +_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 1); + + if (!p) + return -EINVAL; + + opaque_cred->cred_len = be32_to_cpu(*p++); + + p = xdr_inline_decode(xdr, opaque_cred->cred_len); + if (!p) + return -EINVAL; + + opaque_cred->cred = p; + return 0; +} + +/* + * struct pnfs_osd_object_cred { + * struct pnfs_osd_objid oc_object_id; + * u32 oc_osd_version; + * u32 oc_cap_key_sec; + * struct pnfs_osd_opaque_cred oc_cap_key + * struct pnfs_osd_opaque_cred oc_cap; + * }; // xdr size 32 + 4 + 4 + [variable] + [variable] + */ +static int +_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); + int ret; + + if (!p) + return -EIO; + + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p); + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); + if (unlikely(ret)) + return ret; + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); + return ret; +} + +/* + * struct pnfs_osd_data_map { + * u32 odm_num_comps; + * u64 odm_stripe_unit; + * u32 odm_group_width; + * u32 odm_group_depth; + * u32 odm_mirror_cnt; + * u32 odm_raid_algorithm; + * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 + */ +static inline int +_osd_data_map_xdr_sz(void) +{ + return 4 + 8 + 4 + 4 + 4 + 4; +} + +static __be32 * +_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) +{ + data_map->odm_num_comps = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); + data_map->odm_group_width = be32_to_cpup(p++); + data_map->odm_group_depth = be32_to_cpup(p++); + data_map->odm_mirror_cnt = be32_to_cpup(p++); + data_map->odm_raid_algorithm = be32_to_cpup(p++); + dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " + "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", + __func__, + data_map->odm_num_comps, + (unsigned long long)data_map->odm_stripe_unit, + data_map->odm_group_width, + data_map->odm_group_depth, + data_map->odm_mirror_cnt, + data_map->odm_raid_algorithm); + return p; +} + +int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) +{ + __be32 *p; + + memset(iter, 0, sizeof(*iter)); + + p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); + if (unlikely(!p)) + return -EINVAL; + + p = _osd_xdr_decode_data_map(p, &layout->olo_map); + layout->olo_comps_index = be32_to_cpup(p++); + layout->olo_num_comps = be32_to_cpup(p++); + iter->total_comps = layout->olo_num_comps; + return 0; +} + +bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, + int *err) +{ + BUG_ON(iter->decoded_comps > iter->total_comps); + if (iter->decoded_comps == iter->total_comps) + return false; + + *err = _osd_xdr_decode_object_cred(comp, xdr); + if (unlikely(*err)) { + dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " + "total_comps=%d\n", __func__, *err, + iter->decoded_comps, iter->total_comps); + return false; /* stop the loop */ + } + dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " + "key_len=%u cap_len=%u\n", + __func__, + _DEVID_LO(&comp->oc_object_id.oid_device_id), + _DEVID_HI(&comp->oc_object_id.oid_device_id), + comp->oc_object_id.oid_partition_id, + comp->oc_object_id.oid_object_id, + comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); + + iter->decoded_comps++; + return true; +} + +/* + * Get Device Information Decoding + * + * Note: since Device Information is currently done synchronously, all + * variable strings fields are left inside the rpc buffer and are only + * pointed to by the pnfs_osd_deviceaddr members. So the read buffer + * should not be freed while the returned information is in use. + */ +/* + *struct nfs4_string { + * unsigned int len; + * char *data; + *}; // size [variable] + * NOTE: Returned string points to inside the XDR buffer + */ +static __be32 * +__read_u8_opaque(__be32 *p, struct nfs4_string *str) +{ + str->len = be32_to_cpup(p++); + str->data = (char *)p; + + p += XDR_QUADLEN(str->len); + return p; +} + +/* + * struct pnfs_osd_targetid { + * u32 oti_type; + * struct nfs4_string oti_scsi_device_id; + * };// size 4 + [variable] + */ +static __be32 * +__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) +{ + u32 oti_type; + + oti_type = be32_to_cpup(p++); + targetid->oti_type = oti_type; + + switch (oti_type) { + case OBJ_TARGET_SCSI_NAME: + case OBJ_TARGET_SCSI_DEVICE_ID: + p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); + } + + return p; +} + +/* + * struct pnfs_osd_net_addr { + * struct nfs4_string r_netid; + * struct nfs4_string r_addr; + * }; + */ +static __be32 * +__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) +{ + p = __read_u8_opaque(p, &netaddr->r_netid); + p = __read_u8_opaque(p, &netaddr->r_addr); + + return p; +} + +/* + * struct pnfs_osd_targetaddr { + * u32 ota_available; + * struct pnfs_osd_net_addr ota_netaddr; + * }; + */ +static __be32 * +__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) +{ + u32 ota_available; + + ota_available = be32_to_cpup(p++); + targetaddr->ota_available = ota_available; + + if (ota_available) + p = __read_net_addr(p, &targetaddr->ota_netaddr); + + + return p; +} + +/* + * struct pnfs_osd_deviceaddr { + * struct pnfs_osd_targetid oda_targetid; + * struct pnfs_osd_targetaddr oda_targetaddr; + * u8 oda_lun[8]; + * struct nfs4_string oda_systemid; + * struct pnfs_osd_object_cred oda_root_obj_cred; + * struct nfs4_string oda_osdname; + * }; + */ + +/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does + * not have an xdr_stream + */ +static __be32 * +__read_opaque_cred(__be32 *p, + struct pnfs_osd_opaque_cred *opaque_cred) +{ + opaque_cred->cred_len = be32_to_cpu(*p++); + opaque_cred->cred = p; + return p + XDR_QUADLEN(opaque_cred->cred_len); +} + +static __be32 * +__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) +{ + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p++); + + p = __read_opaque_cred(p, &comp->oc_cap_key); + p = __read_opaque_cred(p, &comp->oc_cap); + return p; +} + +void pnfs_osd_xdr_decode_deviceaddr( + struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) +{ + p = __read_targetid(p, &deviceaddr->oda_targetid); + + p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); + + p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, + sizeof(deviceaddr->oda_lun)); + + p = __read_u8_opaque(p, &deviceaddr->oda_systemid); + + p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); + + p = __read_u8_opaque(p, &deviceaddr->oda_osdname); + + /* libosd likes this terminated in dbg. It's last, so no problems */ + deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; +} + +/* + * struct pnfs_osd_layoutupdate { + * u32 dsu_valid; + * s64 dsu_delta; + * u32 olu_ioerr_flag; + * }; xdr size 4 + 8 + 4 + */ +int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, + struct pnfs_osd_layoutupdate *lou) +{ + __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); + + if (!p) + return -E2BIG; + + *p++ = cpu_to_be32(lou->dsu_valid); + if (lou->dsu_valid) + p = xdr_encode_hyper(p, lou->dsu_delta); + *p++ = cpu_to_be32(lou->olu_ioerr_flag); + return 0; +} + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static inline __be32 * +pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) +{ + p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, + sizeof(object_id->oid_device_id.data)); + p = xdr_encode_hyper(p, object_id->oid_partition_id); + p = xdr_encode_hyper(p, object_id->oid_object_id); + + return p; +} + +/* + * struct pnfs_osd_ioerr { + * struct pnfs_osd_objid oer_component; + * u64 oer_comp_offset; + * u64 oer_comp_length; + * u32 oer_iswrite; + * u32 oer_errno; + * }; // xdr size 32 + 24 bytes + */ +void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) +{ + p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); + p = xdr_encode_hyper(p, ioerr->oer_comp_offset); + p = xdr_encode_hyper(p, ioerr->oer_comp_length); + *p++ = cpu_to_be32(ioerr->oer_iswrite); + *p = cpu_to_be32(ioerr->oer_errno); +} + +__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 32 + 24); + if (unlikely(!p)) + dprintk("%s: out of xdr space\n", __func__); + + return p; +} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index c80add6e2213..7913961aff22 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req) TASK_UNINTERRUPTIBLE); } +static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +{ + /* + * FIXME: ideally we should be able to coalesce all requests + * that are not block boundary aligned, but currently this + * is problematic for the case of bsize < PAGE_CACHE_SIZE, + * since nfs_flush_multi and nfs_pagein_multi assume you + * can have only one struct nfs_page. + */ + if (desc->pg_bsize < PAGE_SIZE) + return 0; + + return desc->pg_count + req->wb_bytes <= desc->pg_bsize; +} + /** * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor @@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; + desc->pg_test = nfs_generic_pg_test; + pnfs_pageio_init(desc, inode); } /** @@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, * * Return 'true' if this is the case, else return 'false'. */ -static int nfs_can_coalesce_requests(struct nfs_page *prev, - struct nfs_page *req, - struct nfs_pageio_descriptor *pgio) +static bool nfs_can_coalesce_requests(struct nfs_page *prev, + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) { if (req->wb_context->cred != prev->wb_context->cred) - return 0; + return false; if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) - return 0; + return false; if (req->wb_context->state != prev->wb_context->state) - return 0; + return false; if (req->wb_index != (prev->wb_index + 1)) - return 0; + return false; if (req->wb_pgbase != 0) - return 0; + return false; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) - return 0; - /* - * Non-whole file layouts need to check that req is inside of - * pgio->pg_lseg. - */ - if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) - return 0; - return 1; + return false; + return pgio->pg_test(pgio, prev, req); } /** @@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { - size_t newlen = req->wb_bytes; - if (desc->pg_count != 0) { struct nfs_page *prev; - /* - * FIXME: ideally we should be able to coalesce all requests - * that are not block boundary aligned, but currently this - * is problematic for the case of bsize < PAGE_CACHE_SIZE, - * since nfs_flush_multi and nfs_pagein_multi assume you - * can have only one struct nfs_page. - */ - if (desc->pg_bsize < PAGE_SIZE) - return 0; - newlen += desc->pg_count; - if (newlen > desc->pg_bsize) - return 0; prev = nfs_list_entry(desc->pg_list.prev); if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; - } else + } else { desc->pg_base = req->wb_pgbase; + } nfs_list_remove_request(req); nfs_list_add_request(req, &desc->pg_list); - desc->pg_count = newlen; + desc->pg_count += req->wb_bytes; return 1; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f57f5281a520..8c1309d852a6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo) atomic_inc(&lo->plh_refcount); } +static struct pnfs_layout_hdr * +pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) : + kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); +} + +static void +pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; + return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); +} + static void destroy_layout_hdr(struct pnfs_layout_hdr *lo) { dprintk("%s: freeing layout cache %p\n", __func__, lo); BUG_ON(!list_empty(&lo->plh_layouts)); NFS_I(lo->plh_inode)->layout = NULL; - kfree(lo); + pnfs_free_layout_hdr(lo); } static void @@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg) { struct inode *inode = lseg->pls_layout->plh_inode; - BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); if (list_empty(&lseg->pls_layout->plh_segs)) { set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); @@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg) } EXPORT_SYMBOL_GPL(put_lseg); +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1 : NFS4_MAX_UINT64; +} + +/* + * is l2 fully contained in l1? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline int +lo_seg_contained(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (start1 <= start2) && (end1 >= end2); +} + +/* + * is l1 and l2 intersecting? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline int +lo_seg_intersecting(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (end1 == NFS4_MAX_UINT64 || end1 > start2) && + (end2 == NFS4_MAX_UINT64 || end2 > start1); +} + static bool -should_free_lseg(u32 lseg_iomode, u32 recall_iomode) +should_free_lseg(struct pnfs_layout_range *lseg_range, + struct pnfs_layout_range *recall_range) { - return (recall_iomode == IOMODE_ANY || - lseg_iomode == recall_iomode); + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + lo_seg_intersecting(lseg_range, recall_range); } /* Returns 1 if lseg is removed from list, 0 otherwise */ @@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - u32 iomode) + struct pnfs_layout_range *recall_range) { struct pnfs_layout_segment *lseg, *next; int invalid = 0, removed = 0; @@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return 0; } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (should_free_lseg(lseg->pls_range.iomode, iomode)) { + if (!recall_range || + should_free_lseg(&lseg->pls_range, recall_range)) { dprintk("%s: freeing lseg %p iomode %d " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, @@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) lo = nfsi->layout; if (lo) { lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); } spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); @@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, - u32 iomode, + struct pnfs_layout_range *range, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; @@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo, goto out_err_free; } - lgp->args.minlength = NFS4_MAX_UINT64; + lgp->args.minlength = PAGE_CACHE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - lgp->args.range.iomode = iomode; - lgp->args.range.offset = 0; - lgp->args.range.length = NFS4_MAX_UINT64; + lgp->args.range = *range; lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); @@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, nfs4_proc_layoutget(lgp); if (!lseg) { /* remember that LAYOUTGET failed and suspend trying */ - set_bit(lo_fail_bit(iomode), &lo->plh_flags); + set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); } /* free xdr pages */ @@ -542,6 +619,51 @@ out_err_free: return NULL; } +/* Initiates a LAYOUTRETURN(FILE) */ +int +_pnfs_return_layout(struct inode *ino) +{ + struct pnfs_layout_hdr *lo = NULL; + struct nfs_inode *nfsi = NFS_I(ino); + LIST_HEAD(tmp_list); + struct nfs4_layoutreturn *lrp; + nfs4_stateid stateid; + int status = 0; + + dprintk("--> %s\n", __func__); + + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) { + spin_unlock(&ino->i_lock); + dprintk("%s: no layout segments to return\n", __func__); + goto out; + } + stateid = nfsi->layout->plh_stateid; + /* Reference matched in nfs4_layoutreturn_release */ + get_layout_hdr(lo); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + + WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); + + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); + if (unlikely(lrp == NULL)) { + status = -ENOMEM; + goto out; + } + + lrp->args.stateid = stateid; + lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; + lrp->args.inode = ino; + lrp->clp = NFS_SERVER(ino)->nfs_client; + + status = nfs4_proc_layoutreturn(lrp); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} + bool pnfs_roc(struct inode *ino) { struct pnfs_layout_hdr *lo; @@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier) * are seen first. */ static s64 -cmp_layout(u32 iomode1, u32 iomode2) +cmp_layout(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) { + s64 d; + + /* high offset > low offset */ + d = l1->offset - l2->offset; + if (d) + return d; + + /* short length > long length */ + d = l2->length - l1->length; + if (d) + return d; + /* read > read/write */ - return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); + return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); } static void @@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { struct pnfs_layout_segment *lp; - int found = 0; dprintk("%s:Begin\n", __func__); assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry(lp, &lo->plh_segs, pls_list) { - if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) + if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) continue; list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " @@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, lseg->pls_range.offset, lseg->pls_range.length, lp, lp->pls_range.iomode, lp->pls_range.offset, lp->pls_range.length); - found = 1; - break; - } - if (!found) { - list_add_tail(&lseg->pls_list, &lo->plh_segs); - dprintk("%s: inserted lseg %p " - "iomode %d offset %llu length %llu at tail\n", - __func__, lseg, lseg->pls_range.iomode, - lseg->pls_range.offset, lseg->pls_range.length); + goto out; } + list_add_tail(&lseg->pls_list, &lo->plh_segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length); +out: get_layout_hdr(lo); dprintk("%s:Return\n", __func__); @@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) { struct pnfs_layout_hdr *lo; - lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); + lo = pnfs_alloc_layout_hdr(ino, gfp_flags); if (!lo) return NULL; atomic_set(&lo->plh_refcount, 1); @@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) if (likely(nfsi->layout == NULL)) /* Won the race? */ nfsi->layout = new; else - kfree(new); + pnfs_free_layout_hdr(new); return nfsi->layout; } @@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) * READ RW true */ static int -is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) +is_matching_lseg(struct pnfs_layout_range *ls_range, + struct pnfs_layout_range *range) { - return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); + struct pnfs_layout_range range1; + + if ((range->iomode == IOMODE_RW && + ls_range->iomode != IOMODE_RW) || + !lo_seg_intersecting(ls_range, range)) + return 0; + + /* range1 covers only the first byte in the range */ + range1 = *range; + range1.length = 1; + return lo_seg_contained(ls_range, &range1); } /* * lookup range in layout */ static struct pnfs_layout_segment * -pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) +pnfs_find_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) { struct pnfs_layout_segment *lseg, *ret = NULL; @@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && - is_matching_lseg(lseg, iomode)) { + is_matching_lseg(&lseg->pls_range, range)) { ret = get_lseg(lseg); break; } - if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) + if (cmp_layout(range, &lseg->pls_range) > 0) break; } @@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + loff_t pos, + u64 count, enum pnfs_iomode iomode, gfp_t gfp_flags) { + struct pnfs_layout_range arg = { + .iomode = iomode, + .offset = pos, + .length = count, + }; + unsigned pg_offset; struct nfs_inode *nfsi = NFS_I(ino); struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; struct pnfs_layout_hdr *lo; @@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; /* Check to see if the layout for the given range already exists */ - lseg = pnfs_find_lseg(lo, iomode); + lseg = pnfs_find_lseg(lo, &arg); if (lseg) goto out_unlock; @@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino, spin_unlock(&clp->cl_lock); } - lseg = send_layoutget(lo, ctx, iomode, gfp_flags); + pg_offset = arg.offset & ~PAGE_CACHE_MASK; + if (pg_offset) { + arg.offset -= pg_offset; + arg.length += pg_offset; + } + arg.length = PAGE_CACHE_ALIGN(arg.length); + + lseg = send_layoutget(lo, ctx, &arg, gfp_flags); if (!lseg && first) { spin_lock(&clp->cl_lock); list_del_init(&lo->plh_layouts); @@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; int status = 0; - /* Verify we got what we asked for. - * Note that because the xdr parsing only accepts a single - * element array, this can fail even if the server is behaving - * correctly. - */ - if (lgp->args.range.iomode > res->range.iomode || - res->range.offset != 0 || - res->range.length != NFS4_MAX_UINT64) { - status = -EINVAL; - goto out; - } /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); if (!lseg || IS_ERR(lseg)) { @@ -895,51 +1043,64 @@ out_forget_reply: goto out; } -static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, - struct nfs_page *prev, - struct nfs_page *req) +bool +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) { + enum pnfs_iomode access_type; + gfp_t gfp_flags; + + /* We assume that pg_ioflags == 0 iff we're reading a page */ + if (pgio->pg_ioflags == 0) { + access_type = IOMODE_READ; + gfp_flags = GFP_KERNEL; + } else { + access_type = IOMODE_RW; + gfp_flags = GFP_NOFS; + } + if (pgio->pg_count == prev->wb_bytes) { /* This is first coelesce call for a series of nfs_pages */ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, prev->wb_context, - IOMODE_READ, - GFP_KERNEL); + req_offset(req), + pgio->pg_count, + access_type, + gfp_flags); + return true; } - return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); -} -void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) -{ - struct pnfs_layoutdriver_type *ld; + if (pgio->pg_lseg && + req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length)) + return false; - ld = NFS_SERVER(inode)->pnfs_curr_ld; - pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL; + return true; } +EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); -static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, - struct nfs_page *prev, - struct nfs_page *req) +/* + * Called by non rpc-based layout drivers + */ +int +pnfs_ld_write_done(struct nfs_write_data *data) { - if (pgio->pg_count == prev->wb_bytes) { - /* This is first coelesce call for a series of nfs_pages */ - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - prev->wb_context, - IOMODE_RW, - GFP_NOFS); - } - return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); -} + int status; -void -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) -{ - struct pnfs_layoutdriver_type *ld; + if (!data->pnfs_error) { + pnfs_set_layoutcommit(data); + data->mds_ops->rpc_call_done(&data->task, data); + data->mds_ops->rpc_release(data); + return 0; + } - ld = NFS_SERVER(inode)->pnfs_curr_ld; - pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, + data->pnfs_error); + status = nfs_initiate_write(data, NFS_CLIENT(data->inode), + data->mds_ops, NFS_FILE_SYNC); + return status ? : -EAGAIN; } +EXPORT_SYMBOL_GPL(pnfs_ld_write_done); enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *wdata, @@ -966,6 +1127,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, } /* + * Called by non rpc-based layout drivers + */ +int +pnfs_ld_read_done(struct nfs_read_data *data) +{ + int status; + + if (!data->pnfs_error) { + __nfs4_read_done_cb(data); + data->mds_ops->rpc_call_done(&data->task, data); + data->mds_ops->rpc_release(data); + return 0; + } + + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, + data->pnfs_error); + status = nfs_initiate_read(data, NFS_CLIENT(data->inode), + data->mds_ops); + return status ? : -EAGAIN; +} +EXPORT_SYMBOL_GPL(pnfs_ld_read_done); + +/* * Call the appropriate parallel I/O subsystem read function. */ enum pnfs_try_status @@ -1009,7 +1193,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata) { struct nfs_inode *nfsi = NFS_I(wdata->inode); - loff_t end_pos = wdata->args.offset + wdata->res.count; + loff_t end_pos = wdata->mds_offset + wdata->res.count; bool mark_as_dirty = false; spin_lock(&nfsi->vfs_inode.i_lock); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 0c015bad9e7a..48d0a8e4d062 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,6 +30,7 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +#include <linux/nfs_fs.h> #include <linux/nfs_page.h> enum { @@ -64,17 +65,29 @@ enum { NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ }; +enum layoutdriver_policy_flags { + /* Should the pNFS client commit and return the layout upon a setattr */ + PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, +}; + +struct nfs4_deviceid_node; + /* Per-layout driver specific registration structure */ struct pnfs_layoutdriver_type { struct list_head pnfs_tblid; const u32 id; const char *name; struct module *owner; + unsigned flags; + + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); + void (*free_layout_hdr) (struct pnfs_layout_hdr *); + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); void (*free_lseg) (struct pnfs_layout_segment *lseg); /* test for nfs page cache coalescing */ - int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); /* Returns true if layoutdriver wants to divert this request to * driver's commit routine. @@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type { */ enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); + + void (*free_deviceid_node) (struct nfs4_deviceid_node *); + + void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args); + + void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args); }; struct pnfs_layout_hdr { @@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev); extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ void get_layout_hdr(struct pnfs_layout_hdr *lo); void put_lseg(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - enum pnfs_iomode access_type, gfp_t gfp_flags); + loff_t pos, u64 count, enum pnfs_iomode access_type, + gfp_t gfp_flags); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, const struct rpc_call_ops *, int); enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, const struct rpc_call_ops *); -void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); -void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *); +bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct nfs4_state *open_state); int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - u32 iomode); + struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier); void pnfs_set_layoutcommit(struct nfs_write_data *wdata); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int _pnfs_return_layout(struct inode *); +int pnfs_ld_write_done(struct nfs_write_data *); +int pnfs_ld_read_done(struct nfs_read_data *); + +/* pnfs_dev.c */ +struct nfs4_deviceid_node { + struct hlist_node node; + const struct pnfs_layoutdriver_type *ld; + const struct nfs_client *nfs_client; + struct nfs4_deviceid deviceid; + atomic_t ref; +}; + +void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); +struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, + const struct pnfs_layoutdriver_type *, + const struct nfs_client *, + const struct nfs4_deviceid *); +struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); +bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_deviceid_purge_client(const struct nfs_client *); static inline int lo_fail_bit(u32 iomode) { @@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req) put_lseg(req->wb_commit_lseg); } +/* Should the pNFS client commit and return the layout upon a setattr */ +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_SETATTR; +} + +static inline int pnfs_return_layout(struct inode *ino) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_server *nfss = NFS_SERVER(ino); + + if (pnfs_enabled_sb(nfss) && nfsi->layout) + return _pnfs_return_layout(ino); + + return 0; +} + +static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld) + pgio->pg_test = ld->pg_test; +} + #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) static inline struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - enum pnfs_iomode access_type, gfp_t gfp_flags) + loff_t pos, u64 count, enum pnfs_iomode access_type, + gfp_t gfp_flags) { return NULL; } @@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data, return PNFS_NOT_ATTEMPTED; } +static inline int pnfs_return_layout(struct inode *ino) +{ + return 0; +} + +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + return false; +} + static inline bool pnfs_roc(struct inode *ino) { @@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) -{ - pgio->pg_test = NULL; -} - -static inline void -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino) +static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, + struct inode *inode) { - pgio->pg_test = NULL; } static inline void @@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { return 0; } + +static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl) +{ +} #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c new file mode 100644 index 000000000000..c65e133ce9c0 --- /dev/null +++ b/fs/nfs/pnfs_dev.c @@ -0,0 +1,270 @@ +/* + * Device operations for the pnfs client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * Garth Goodson <Garth.Goodson@netapp.com> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* + * Device ID RCU cache. A device ID is unique per server and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfs4_deviceid_lock); + +void +nfs4_print_deviceid(const struct nfs4_deviceid *id) +{ + u32 *p = (u32 *)id; + + dprintk("%s: device id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); +} +EXPORT_SYMBOL_GPL(nfs4_print_deviceid); + +static inline u32 +nfs4_deviceid_hash(const struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +static struct nfs4_deviceid_node * +_lookup_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n; + + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + if (d->ld == ld && d->nfs_client == clp && + !memcmp(&d->deviceid, id, sizeof(*id))) { + if (atomic_read(&d->ref)) + return d; + else + continue; + } + return NULL; +} + +/* + * Lookup a deviceid in cache and get a reference count on it if found + * + * @clp nfs_client associated with deviceid + * @id deviceid to look up + */ +struct nfs4_deviceid_node * +_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, hash); + if (d && !atomic_inc_not_zero(&d->ref)) + d = NULL; + rcu_read_unlock(); + return d; +} + +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); +} +EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); + +/* + * Unhash and put deviceid + * + * @clp nfs_client associated with deviceid + * @id the deviceid to unhash + * + * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. + */ +struct nfs4_deviceid_node * +nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + struct nfs4_deviceid_node *d; + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); + rcu_read_unlock(); + if (!d) { + spin_unlock(&nfs4_deviceid_lock); + return NULL; + } + hlist_del_init_rcu(&d->node); + spin_unlock(&nfs4_deviceid_lock); + synchronize_rcu(); + + /* balance the initial ref set in pnfs_insert_deviceid */ + if (atomic_dec_and_test(&d->ref)) + return d; + + return NULL; +} +EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid); + +/* + * Delete a deviceid from cache + * + * @clp struct nfs_client qualifying the deviceid + * @id deviceid to delete + */ +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + struct nfs4_deviceid_node *d; + + d = nfs4_unhash_put_deviceid(ld, clp, id); + if (!d) + return; + d->ld->free_deviceid_node(d); +} +EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); + +void +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, + const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *nfs_client, + const struct nfs4_deviceid *id) +{ + INIT_HLIST_NODE(&d->node); + d->ld = ld; + d->nfs_client = nfs_client; + d->deviceid = *id; + atomic_set(&d->ref, 1); +} +EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); + +/* + * Uniquely initialize and insert a deviceid node into cache + * + * @new new deviceid node + * Note that the caller must set up the following members: + * new->ld + * new->nfs_client + * new->deviceid + * + * @ret the inserted node, if none found, otherwise, the found entry. + */ +struct nfs4_deviceid_node * +nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) +{ + struct nfs4_deviceid_node *d; + long hash; + + spin_lock(&nfs4_deviceid_lock); + hash = nfs4_deviceid_hash(&new->deviceid); + d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash); + if (d) { + spin_unlock(&nfs4_deviceid_lock); + return d; + } + + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + spin_unlock(&nfs4_deviceid_lock); + + return new; +} +EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); + +/* + * Dereference a deviceid node and delete it when its reference count drops + * to zero. + * + * @d deviceid node to put + * + * @ret true iff the node was deleted + */ +bool +nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) +{ + if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) + return false; + hlist_del_init_rcu(&d->node); + spin_unlock(&nfs4_deviceid_lock); + synchronize_rcu(); + d->ld->free_deviceid_node(d); + return true; +} +EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); + +static void +_deviceid_purge_client(const struct nfs_client *clp, long hash) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n, *next; + HLIST_HEAD(tmp); + + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + if (d->nfs_client == clp && atomic_read(&d->ref)) { + hlist_del_init_rcu(&d->node); + hlist_add_head(&d->node, &tmp); + } + rcu_read_unlock(); + + if (hlist_empty(&tmp)) + return; + + synchronize_rcu(); + hlist_for_each_entry_safe(d, n, next, &tmp, node) + if (atomic_dec_and_test(&d->ref)) + d->ld->free_deviceid_node(d); +} + +void +nfs4_deviceid_purge_client(const struct nfs_client *clp) +{ + long h; + + spin_lock(&nfs4_deviceid_lock); + for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) + _deviceid_purge_client(clp, h); + spin_unlock(&nfs4_deviceid_lock); +} diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 2bcf0dc306a1..20a7f952e244 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) atomic_set(&req->wb_complete, requests); BUG_ON(desc->pg_lseg != NULL); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_READ, GFP_KERNEL); ClearPageError(page); offset = 0; nbytes = desc->pg_count; @@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) } req = nfs_list_entry(data->pages.next); if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_READ, GFP_KERNEL); ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 0, lseg); @@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - pnfs_pageio_init_read(&pgio, inode); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e288f06d3fa7..ce40e5c568ba 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -63,6 +63,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } +#ifdef CONFIG_NFS_V4_1 +void show_sessions(struct seq_file *m, struct nfs_server *server) +{ + if (nfs4_has_session(server->nfs_client)) + seq_printf(m, ",sessions"); +} +#else +void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif + +#ifdef CONFIG_NFS_V4_1 +void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ + seq_printf(m, ",pnfs="); + if (server->pnfs_curr_ld) + seq_printf(m, "%s", server->pnfs_curr_ld->name); + else + seq_printf(m, "not configured"); +} +#else /* CONFIG_NFS_V4_1 */ +void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif /* CONFIG_NFS_V4_1 */ static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) { @@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + show_sessions(m, nfss); + show_pnfs(m, nfss); } #endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 49c715b4ac92..e268e3b23497 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) atomic_set(&req->wb_complete, requests); BUG_ON(desc->pg_lseg); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_RW, GFP_NOFS); ClearPageError(page); offset = 0; nbytes = desc->pg_count; @@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) } req = nfs_list_entry(data->pages.next); if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, + req_offset(req), desc->pg_count, + IOMODE_RW, GFP_NOFS); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) @@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, { size_t wsize = NFS_SERVER(inode)->wsize; - pnfs_pageio_init_write(pgio, inode); - if (wsize < PAGE_CACHE_SIZE) nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); else diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ad000aeb21a2..b9566e46219f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1354,12 +1354,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); - if (rv) - goto out; - rv = check_nfsd_access(exp, rqstp); - if (rv) - fh_put(fhp); -out: exp_put(exp); return rv; } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 2247fc91d5e9..9095f3c21df9 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -245,7 +245,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, } /* Now create the file and set attributes */ - nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len, + nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, attr, newfhp, argp->createmode, argp->verf, NULL, NULL); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index ad48faca20fc..08c6e36ab2eb 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -842,7 +842,7 @@ out: return rv; } -__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) { struct svc_fh fh; int err; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5fcb1396a7e3..3a6dbd70b34b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -196,9 +196,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o /* * Note: create modes (UNCHECKED,GUARDED...) are the same - * in NFSv4 as in v3. + * in NFSv4 as in v3 except EXCLUSIVE4_1. */ - status = nfsd_create_v3(rqstp, current_fh, open->op_fname.data, + status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, &resfh, open->op_createmode, (u32 *)open->op_verf.data, @@ -403,7 +403,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, putfh->pf_fhlen); - return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS); } static __be32 @@ -762,6 +762,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 err; fh_init(&resfh, NFS4_FHSIZE); + err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, secinfo->si_name, secinfo->si_namelen, &exp, &dentry); @@ -986,6 +989,9 @@ enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ + /* For rfc 5661 section 2.6.3.1.1: */ + OP_HANDLES_WRONGSEC = 1 << 3, + OP_IS_PUTFH_LIKE = 1 << 4, }; struct nfsd4_operation { @@ -1031,6 +1037,44 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) return nfs_ok; } +static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) +{ + return &nfsd4_ops[op->opnum]; +} + +static bool need_wrongsec_check(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *next = &argp->ops[resp->opcnt]; + struct nfsd4_operation *thisd; + struct nfsd4_operation *nextd; + + thisd = OPDESC(this); + /* + * Most ops check wronsec on our own; only the putfh-like ops + * have special rules. + */ + if (!(thisd->op_flags & OP_IS_PUTFH_LIKE)) + return false; + /* + * rfc 5661 2.6.3.1.1.6: don't bother erroring out a + * put-filehandle operation if we're not going to use the + * result: + */ + if (argp->opcnt == resp->opcnt) + return false; + + nextd = OPDESC(next); + /* + * Rest of 2.6.3.1.1: certain operations will return WRONGSEC + * errors themselves as necessary; others should check for them + * now: + */ + return !(nextd->op_flags & OP_HANDLES_WRONGSEC); +} + /* * COMPOUND call. */ @@ -1108,7 +1152,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } - opdesc = &nfsd4_ops[op->opnum]; + opdesc = OPDESC(op); if (!cstate->current_fh.fh_dentry) { if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { @@ -1126,6 +1170,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, else BUG_ON(op->status == nfs_ok); + if (!op->status && need_wrongsec_check(rqstp)) + op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + encode_op: /* Only from SEQUENCE */ if (resp->cstate.status == nfserr_replay_cache) { @@ -1217,10 +1264,12 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_LOOKUP", }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_LOOKUPP", }, [OP_NVERIFY] = { @@ -1229,6 +1278,7 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_OPEN", }, [OP_OPEN_CONFIRM] = { @@ -1241,17 +1291,20 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, .op_name = "OP_PUTPUBFH", }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, .op_name = "OP_PUTROOTFH", }, [OP_READ] = { @@ -1281,15 +1334,18 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE, .op_name = "OP_RESTOREFH", }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SAVEFH", }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO", }, [OP_SETATTR] = { @@ -1353,6 +1409,7 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_SECINFO_NO_NAME] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", }, }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4cf04e11c66c..e98f3c2e9492 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1519,6 +1519,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, bool confirm_me = false; int status = 0; + if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) + return nfserr_inval; + nfs4_lock_state(); unconf = find_unconfirmed_client(&cr_ses->clientid); conf = find_confirmed_client(&cr_ses->clientid); @@ -1637,8 +1640,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, return nfserr_badsession; status = nfsd4_map_bcts_dir(&bcts->dir); - nfsd4_new_conn(rqstp, cstate->session, bcts->dir); - return nfs_ok; + if (!status) + nfsd4_new_conn(rqstp, cstate->session, bcts->dir); + return status; } static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) @@ -1725,6 +1729,13 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi return; } +static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + return args->opcnt > session->se_fchannel.maxops; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1753,6 +1764,10 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (!session) goto out; + status = nfserr_too_many_ops; + if (nfsd4_session_too_many_ops(rqstp, session)) + goto out; + status = nfserr_badslot; if (seq->slotid >= session->se_fchannel.maxreqs) goto out; @@ -1808,6 +1823,8 @@ out: __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) { + int status = 0; + if (rc->rca_one_fs) { if (!cstate->current_fh.fh_dentry) return nfserr_nofilehandle; @@ -1817,9 +1834,14 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta */ return nfs_ok; } + nfs4_lock_state(); - if (is_client_expired(cstate->session->se_client)) { - nfs4_unlock_state(); + status = nfserr_complete_already; + if (cstate->session->se_client->cl_firststate) + goto out; + + status = nfserr_stale_clientid; + if (is_client_expired(cstate->session->se_client)) /* * The following error isn't really legal. * But we only get here if the client just explicitly @@ -1827,11 +1849,13 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta * error it gets back on an operation for the dead * client. */ - return nfserr_stale_clientid; - } + goto out; + + status = nfs_ok; nfsd4_create_clid_dir(cstate->session->se_client); +out: nfs4_unlock_state(); - return nfs_ok; + return status; } __be32 @@ -2462,7 +2486,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid) return NULL; } -int share_access_to_flags(u32 share_access) +static int share_access_to_flags(u32 share_access) { share_access &= ~NFS4_SHARE_WANT_MASK; @@ -2882,7 +2906,7 @@ out: return status; } -struct lock_manager nfsd4_manager = { +static struct lock_manager nfsd4_manager = { }; static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c6766af00d98..990181103214 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -424,15 +424,12 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) { DECODE_HEAD; - u32 dummy; READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); READ32(bcts->dir); - /* XXX: Perhaps Tom Tucker could help us figure out how we - * should be using ctsa_use_conn_in_rdma_mode: */ - READ32(dummy); - + /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker + * could help us figure out we should be using it. */ DECODE_TAIL; } @@ -588,8 +585,6 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) READ_BUF(lockt->lt_owner.len); READMEM(lockt->lt_owner.data, lockt->lt_owner.len); - if (argp->minorversion && !zero_clientid(&lockt->lt_clientid)) - return nfserr_inval; DECODE_TAIL; } @@ -3120,7 +3115,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, return nfserr; } -__be32 +static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_sequence *seq) { diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 55c8e63af0be..90c6aa6d5e0f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -344,7 +344,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) * which clients virtually always use auth_sys for, * even while using RPCSEC_GSS for NFS. */ - if (access & NFSD_MAY_LOCK) + if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS) goto skip_pseudoflavor_check; /* * Clients may expect to be able to use auth_sys during mount, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 129f3c9f62d5..d5718273bb32 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -181,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, struct svc_export *exp; struct dentry *dparent; struct dentry *dentry; - __be32 err; int host_err; dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); - /* Obtain dentry and export. */ - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); - if (err) - return err; - dparent = fhp->fh_dentry; exp = fhp->fh_export; exp_get(exp); @@ -254,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, struct dentry *dentry; __be32 err; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); if (err) return err; @@ -877,13 +874,11 @@ static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - struct inode *inode; mm_segment_t oldfs; __be32 err; int host_err; err = nfserr_perm; - inode = file->f_path.dentry->d_inode; if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { @@ -1340,11 +1335,18 @@ out_nfserr: } #ifdef CONFIG_NFSD_V3 + +static inline int nfsd_create_is_exclusive(int createmode) +{ + return createmode == NFS3_CREATE_EXCLUSIVE + || createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + /* - * NFSv3 version of nfsd_create + * NFSv3 and NFSv4 version of nfsd_create */ __be32 -nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, +do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, struct svc_fh *resfhp, int createmode, u32 *verifier, int *truncp, int *created) @@ -1396,7 +1398,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out; - if (createmode == NFS3_CREATE_EXCLUSIVE) { + if (nfsd_create_is_exclusive(createmode)) { /* solaris7 gets confused (bugid 4218508) if these have * the high bit set, so just clear the high bits. If this is * ever changed to use different attrs for storing the @@ -1437,6 +1439,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, && dchild->d_inode->i_atime.tv_sec == v_atime && dchild->d_inode->i_size == 0 ) break; + case NFS4_CREATE_EXCLUSIVE4_1: + if ( dchild->d_inode->i_mtime.tv_sec == v_mtime + && dchild->d_inode->i_atime.tv_sec == v_atime + && dchild->d_inode->i_size == 0 ) + goto set_attr; /* fallthru */ case NFS3_CREATE_GUARDED: err = nfserr_exist; @@ -1455,7 +1462,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_check_ignore_resizing(iap); - if (createmode == NFS3_CREATE_EXCLUSIVE) { + if (nfsd_create_is_exclusive(createmode)) { /* Cram the verifier into atime/mtime */ iap->ia_valid = ATTR_MTIME|ATTR_ATIME | ATTR_MTIME_SET|ATTR_ATIME_SET; @@ -2034,7 +2041,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct inode *inode = dentry->d_inode; int err; - if (acc == NFSD_MAY_NOP) + if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) return 0; #if 0 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 9a370a5e36b7..e0bbac04d1dd 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -17,10 +17,14 @@ #define NFSD_MAY_SATTR 8 #define NFSD_MAY_TRUNC 16 #define NFSD_MAY_LOCK 32 +#define NFSD_MAY_MASK 63 + +/* extra hints to permission and open routines: */ #define NFSD_MAY_OWNER_OVERRIDE 64 #define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 #define NFSD_MAY_NOT_BREAK_LEASE 512 +#define NFSD_MAY_BYPASS_GSS 1024 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) @@ -54,7 +58,7 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, int type, dev_t rdev, struct svc_fh *res); #ifdef CONFIG_NFSD_V3 __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); -__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *, +__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, u32 *verifier, int *truncp, int *created); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 587f18432832..b954878ad6ce 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -917,7 +917,7 @@ int nilfs_mark_inode_dirty(struct inode *inode) * construction. This function can be called both as a single operation * and as a part of indivisible file operations. */ -void nilfs_dirty_inode(struct inode *inode) +void nilfs_dirty_inode(struct inode *inode, int flags) { struct nilfs_transaction_info ti; struct nilfs_mdt_info *mdi = NILFS_MDT(inode); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1102a5fbb744..546849b3e88f 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) struct nilfs_transaction_info ti; int err; - dentry_unhash(dentry); - err = nilfs_transaction_begin(dir->i_sb, &ti, 0); if (err) return err; @@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct nilfs_transaction_info ti; int err; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); if (unlikely(err)) return err; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index a9c6a531f80c..f02b9ad43a21 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -269,7 +269,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); extern int nilfs_mark_inode_dirty(struct inode *); -extern void nilfs_dirty_inode(struct inode *); +extern void nilfs_dirty_inode(struct inode *, int flags); int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index c368360c35a1..3b8d3979e03b 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -241,11 +241,9 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry) int ret; - if (S_ISDIR(inode->i_mode)) { - dentry_unhash(dentry); - if (!omfs_dir_is_empty(inode)) - return -ENOTEMPTY; - } + if (S_ISDIR(inode->i_mode) && + !omfs_dir_is_empty(inode)) + return -ENOTEMPTY; ret = omfs_delete_entry(dentry); if (ret) @@ -382,9 +380,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, int err; if (new_inode) { - if (S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - /* overwriting existing file/dir */ err = omfs_remove(new_dir, new_dentry); if (err) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f82e762eeca2..d545e97d99c3 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -255,13 +255,7 @@ ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - struct gendisk *disk = dev_to_disk(dev); - unsigned int alignment = 0; - - if (disk->queue) - alignment = queue_limit_discard_alignment(&disk->queue->limits, - p->start_sect); - return sprintf(buf, "%u\n", alignment); + return sprintf(buf, "%u\n", p->discard_alignment); } ssize_t part_stat_show(struct device *dev, @@ -455,6 +449,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->start_sect = start; p->alignment_offset = queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/fs/proc/base.c b/fs/proc/base.c index 4ede550517a6..14def991d9dd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,6 +83,9 @@ #include <linux/pid_namespace.h> #include <linux/fs_struct.h> #include <linux/slab.h> +#ifdef CONFIG_HARDWALL +#include <asm/hardwall.h> +#endif #include "internal.h" /* NOTE: @@ -2842,6 +2845,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING INF("io", S_IRUGO, proc_tgid_io_accounting), #endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif }; static int proc_tgid_base_readdir(struct file * filp, @@ -3181,6 +3187,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING INF("io", S_IRUGO, proc_tid_io_accounting), #endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif }; static int proc_tid_base_readdir(struct file * filp, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 76c8164d5651..118662690cdf 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) INITIALIZE_PATH(path); struct reiserfs_dir_entry de; - dentry_unhash(dentry); - /* we will be doing 2 balancings and update 2 stat data, we change quotas * of the owner of the directory and of the owner of the parent directory. * The quota structure is possibly deleted only on last iput => outside @@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned long savelink = 1; struct timespec ctime; - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) - dentry_unhash(new_dentry); - /* three balancings: (1) old name removal, (2) new name insertion and (3) maybe "save" link insertion stat data updates: (1) old directory, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b216ff6be1c9..aa91089162cb 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -568,7 +568,7 @@ static void destroy_inodecache(void) } /* we don't mark inodes dirty, we just log them */ -static void reiserfs_dirty_inode(struct inode *inode) +static void reiserfs_dirty_inode(struct inode *inode, int flags) { struct reiserfs_transaction_handle th; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 50f1abccd1cd..e8a62f41b458 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -98,7 +98,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); - dentry_unhash(dentry); error = dir->i_op->rmdir(dir, dentry); if (!error) dentry->d_inode->i_flags |= S_DEAD; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 730c56248c9b..5e1101ff276f 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -147,7 +147,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, * table[0] points to the first inode lookup table metadata block, * this should be less than lookup_table_start */ - if (!IS_ERR(table) && table[0] >= lookup_table_start) { + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index 1516a6490bfb..0ed6edbc5c71 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -90,7 +90,7 @@ __le64 *squashfs_read_fragment_index_table(struct super_block *sb, * table[0] points to the first fragment table metadata block, this * should be less than fragment_table_start */ - if (!IS_ERR(table) && table[0] >= fragment_table_start) { + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index a70858e0fb44..d38ea3dab951 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -93,7 +93,7 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb, * table[0] points to the first id lookup table metadata block, this * should be less than id_table_start */ - if (!IS_ERR(table) && table[0] >= id_table_start) { + if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6f26abee3597..7438850c62d0 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -245,7 +245,7 @@ allocate_id_index_table: msblk->id_table = NULL; goto failed_mount; } - next_table = msblk->id_table[0]; + next_table = le64_to_cpu(msblk->id_table[0]); /* Handle inode lookup table */ lookup_table_start = le64_to_cpu(sblk->lookup_table_start); @@ -261,7 +261,7 @@ allocate_id_index_table: msblk->inode_lookup_table = NULL; goto failed_mount; } - next_table = msblk->inode_lookup_table[0]; + next_table = le64_to_cpu(msblk->inode_lookup_table[0]); sb->s_export_op = &squashfs_export_ops; @@ -286,7 +286,7 @@ handle_fragments: msblk->fragment_index = NULL; goto failed_mount; } - next_table = msblk->fragment_index[0]; + next_table = le64_to_cpu(msblk->fragment_index[0]); check_directory_table: /* Sanity check directory_table */ diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index e2cc6756f3b1..e474fbcf8bde 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) struct inode *inode = dentry->d_inode; int err = -ENOTEMPTY; - dentry_unhash(dentry); - if (sysv_empty_dir(inode)) { err = sysv_unlink(dir, dentry); if (!err) { @@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, struct sysv_dir_entry * old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = sysv_find_entry(old_dentry, &old_page); if (!old_de) goto out; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index c2b80943560d..ef5abd38f0bf 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; - dentry_unhash(dentry); - /* * Budget request settings: deletion direntry, deletion inode and * changing the parent inode. If budgeting fails, go ahead anyway @@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - /* * Budget request settings: deletion direntry, new direntry, removing * the old inode, and changing old and new parent directory inodes. diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 166951e0dcd3..3be645e012c9 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -581,6 +581,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); if (c->leb_size - wbuf->offs >= c->max_write_size) ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); @@ -759,6 +760,7 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); if (c->ro_error) return -EROFS; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 34b1679e6e3a..cef0460f4c54 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -669,6 +669,7 @@ out_free: out_release: release_head(c, BASEHD); + kfree(dent); out_ro: ubifs_ro_mode(c, err); if (last_reference) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index bd644bf587a8..a5422fffbd69 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -674,7 +674,7 @@ static int kill_orphans(struct ubifs_info *c) if (IS_ERR(sleb)) { if (PTR_ERR(sleb) == -EUCLEAN) sleb = ubifs_recover_leb(c, lnum, 0, - c->sbuf, 0); + c->sbuf, -1); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 731d9e2e7b50..783d8e0beb76 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, } /** - * drop_last_node - drop the last node or group of nodes. + * drop_last_group - drop the last group of nodes. * @sleb: scanned LEB information * @offs: offset of dropped nodes is returned here - * @grouped: non-zero if whole group of nodes have to be dropped * * This is a helper function for 'ubifs_recover_leb()' which drops the last - * node of the scanned LEB or the last group of nodes if @grouped is not zero. - * This function returns %1 if a node was dropped and %0 otherwise. + * group of nodes of the scanned LEB. */ -static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped) +static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs) { - int dropped = 0; - while (!list_empty(&sleb->nodes)) { struct ubifs_scan_node *snod; struct ubifs_ch *ch; @@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped) list); ch = snod->node; if (ch->group_type != UBIFS_IN_NODE_GROUP) - return dropped; - dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs); + break; + + dbg_rcvry("dropping grouped node at %d:%d", + sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + } +} + +/** + * drop_last_node - drop the last node. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * @grouped: non-zero if whole group of nodes have to be dropped + * + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * node of the scanned LEB. + */ +static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs) +{ + struct ubifs_scan_node *snod; + + if (!list_empty(&sleb->nodes)) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + + dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs); *offs = snod->offs; list_del(&snod->list); kfree(snod); sleb->nodes_cnt -= 1; - dropped = 1; - if (!grouped) - break; } - return dropped; } /** @@ -604,7 +623,8 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped) * @lnum: LEB number * @offs: offset * @sbuf: LEB-sized buffer to use - * @grouped: nodes may be grouped for recovery + * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not + * belong to any journal head) * * This function does a scan of a LEB, but caters for errors that might have * been caused by the unclean unmount from which we are attempting to recover. @@ -612,13 +632,14 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped) * found, and a negative error code in case of failure. */ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, - int offs, void *sbuf, int grouped) + int offs, void *sbuf, int jhead) { int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; + int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped; struct ubifs_scan_leb *sleb; void *buf = sbuf + offs; - dbg_rcvry("%d:%d", lnum, offs); + dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped); sleb = ubifs_start_scan(c, lnum, offs, sbuf); if (IS_ERR(sleb)) @@ -635,7 +656,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, * Scan quietly until there is an error from which we cannot * recover */ - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); if (ret == SCANNED_A_NODE) { /* A valid node, and not a padding node */ struct ubifs_ch *ch = buf; @@ -695,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, * If nodes are grouped, always drop the incomplete group at * the end. */ - drop_last_node(sleb, &offs, 1); + drop_last_group(sleb, &offs); - /* - * While we are in the middle of the same min. I/O unit keep dropping - * nodes. So basically, what we want is to make sure that the last min. - * I/O unit where we saw the corruption is dropped completely with all - * the uncorrupted node which may possibly sit there. - * - * In other words, let's name the min. I/O unit where the corruption - * starts B, and the previous min. I/O unit A. The below code tries to - * deal with a situation when half of B contains valid nodes or the end - * of a valid node, and the second half of B contains corrupted data or - * garbage. This means that UBIFS had been writing to B just before the - * power cut happened. I do not know how realistic is this scenario - * that half of the min. I/O unit had been written successfully and the - * other half not, but this is possible in our 'failure mode emulation' - * infrastructure at least. - * - * So what is the problem, why we need to drop those nodes? Whey can't - * we just clean-up the second half of B by putting a padding node - * there? We can, and this works fine with one exception which was - * reproduced with power cut emulation testing and happens extremely - * rarely. The description follows, but it is worth noting that that is - * only about the GC head, so we could do this trick only if the bud - * belongs to the GC head, but it does not seem to be worth an - * additional "if" statement. - * - * So, imagine the file-system is full, we run GC which is moving valid - * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head - * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X - * and will try to continue. Imagine that LEB X is currently the - * dirtiest LEB, and the amount of used space in LEB Y is exactly the - * same as amount of free space in LEB X. - * - * And a power cut happens when nodes are moved from LEB X to LEB Y. We - * are here trying to recover LEB Y which is the GC head LEB. We find - * the min. I/O unit B as described above. Then we clean-up LEB Y by - * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function - * fails, because it cannot find a dirty LEB which could be GC'd into - * LEB Y! Even LEB X does not match because the amount of valid nodes - * there does not fit the free space in LEB Y any more! And this is - * because of the padding node which we added to LEB Y. The - * user-visible effect of this which I once observed and analysed is - * that we cannot mount the file-system with -ENOSPC error. - * - * So obviously, to make sure that situation does not happen we should - * free min. I/O unit B in LEB Y completely and the last used min. I/O - * unit in LEB Y should be A. This is basically what the below code - * tries to do. - */ - while (min_io_unit == round_down(offs, c->min_io_size) && - min_io_unit != offs && - drop_last_node(sleb, &offs, grouped)); + if (jhead == GCHD) { + /* + * If this LEB belongs to the GC head then while we are in the + * middle of the same min. I/O unit keep dropping nodes. So + * basically, what we want is to make sure that the last min. + * I/O unit where we saw the corruption is dropped completely + * with all the uncorrupted nodes which may possibly sit there. + * + * In other words, let's name the min. I/O unit where the + * corruption starts B, and the previous min. I/O unit A. The + * below code tries to deal with a situation when half of B + * contains valid nodes or the end of a valid node, and the + * second half of B contains corrupted data or garbage. This + * means that UBIFS had been writing to B just before the power + * cut happened. I do not know how realistic is this scenario + * that half of the min. I/O unit had been written successfully + * and the other half not, but this is possible in our 'failure + * mode emulation' infrastructure at least. + * + * So what is the problem, why we need to drop those nodes? Why + * can't we just clean-up the second half of B by putting a + * padding node there? We can, and this works fine with one + * exception which was reproduced with power cut emulation + * testing and happens extremely rarely. + * + * Imagine the file-system is full, we run GC which starts + * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is + * the current GC head LEB). The @c->gc_lnum is -1, which means + * that GC will retain LEB X and will try to continue. Imagine + * that LEB X is currently the dirtiest LEB, and the amount of + * used space in LEB Y is exactly the same as amount of free + * space in LEB X. + * + * And a power cut happens when nodes are moved from LEB X to + * LEB Y. We are here trying to recover LEB Y which is the GC + * head LEB. We find the min. I/O unit B as described above. + * Then we clean-up LEB Y by padding min. I/O unit. And later + * 'ubifs_rcvry_gc_commit()' function fails, because it cannot + * find a dirty LEB which could be GC'd into LEB Y! Even LEB X + * does not match because the amount of valid nodes there does + * not fit the free space in LEB Y any more! And this is + * because of the padding node which we added to LEB Y. The + * user-visible effect of this which I once observed and + * analysed is that we cannot mount the file-system with + * -ENOSPC error. + * + * So obviously, to make sure that situation does not happen we + * should free min. I/O unit B in LEB Y completely and the last + * used min. I/O unit in LEB Y should be A. This is basically + * what the below code tries to do. + */ + while (offs > min_io_unit) + drop_last_node(sleb, &offs); + } buf = sbuf + offs; len = c->leb_size - offs; @@ -881,7 +905,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, } ubifs_scan_destroy(sleb); } - return ubifs_recover_leb(c, lnum, offs, sbuf, 0); + return ubifs_recover_leb(c, lnum, offs, sbuf, -1); } /** diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 6617280d1679..5e97161ce4d3 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -557,8 +557,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) * these LEBs could possibly be written to at the power cut * time. */ - sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, - b->bud->jhead != GCHD); + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead); else sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); if (IS_ERR(sleb)) diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 46961c003236..9e1d05666fed 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,13 +277,18 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) +int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc) { + int nr = sc->nr_to_scan; int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); if (nr == 0) - return clean_zn_cnt; + /* + * Due to the way UBIFS updates the clean znode counter it may + * temporarily be negative. + */ + return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; if (!clean_zn_cnt) { /* diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 6db0bdaa9f74..b5aeb5a8ebed 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -382,7 +382,7 @@ done: end_writeback(inode); } -static void ubifs_dirty_inode(struct inode *inode) +static void ubifs_dirty_inode(struct inode *inode, int flags) { struct ubifs_inode *ui = ubifs_inode(inode); @@ -811,15 +811,18 @@ static int alloc_wbufs(struct ubifs_info *c) c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; c->jheads[i].wbuf.jhead = i; + c->jheads[i].grouped = 1; } c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; /* * Garbage Collector head likely contains long-term data and - * does not need to be synchronized by timer. + * does not need to be synchronized by timer. Also GC head nodes are + * not grouped. */ c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; c->jheads[GCHD].wbuf.no_timer = 1; + c->jheads[GCHD].grouped = 0; return 0; } @@ -1284,12 +1287,25 @@ static int mount_ubifs(struct ubifs_info *c) if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { ubifs_msg("recovery needed"); c->need_recovery = 1; - if (!c->ro_mount) { - err = ubifs_recover_inl_heads(c, c->sbuf); - if (err) - goto out_master; - } - } else if (!c->ro_mount) { + } + + if (c->need_recovery && !c->ro_mount) { + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out_master; + } + + err = ubifs_lpt_init(c, 1, !c->ro_mount); + if (err) + goto out_master; + + if (!c->ro_mount && c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out_master; + } + + if (!c->ro_mount) { /* * Set the "dirty" flag so that if we reboot uncleanly we * will notice this immediately on the next mount. @@ -1297,13 +1313,9 @@ static int mount_ubifs(struct ubifs_info *c) c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); err = ubifs_write_master(c); if (err) - goto out_master; + goto out_lpt; } - err = ubifs_lpt_init(c, 1, !c->ro_mount); - if (err) - goto out_lpt; - err = dbg_check_idx_size(c, c->bi.old_idx_sz); if (err) goto out_lpt; @@ -1396,12 +1408,6 @@ static int mount_ubifs(struct ubifs_info *c) } else ubifs_assert(c->lst.taken_empty_lebs > 0); - if (!c->ro_mount && c->space_fixup) { - err = ubifs_fixup_free_space(c); - if (err) - goto out_infos; - } - err = dbg_check_filesystem(c); if (err) goto out_infos; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 8119b1fd8d94..91b4213dde84 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -2876,12 +2876,13 @@ static void tnc_destroy_cnext(struct ubifs_info *c) */ void ubifs_tnc_close(struct ubifs_info *c) { - long clean_freed; - tnc_destroy_cnext(c); if (c->zroot.znode) { - clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode); - atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt); + long n; + + ubifs_destroy_tnc_subtree(c->zroot.znode); + n = atomic_long_read(&c->clean_zn_cnt); + atomic_long_sub(n, &ubifs_clean_zn_cnt); } kfree(c->gap_lebs); kfree(c->ilebs); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 93d1412a06f0..f79983d6f860 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -722,12 +722,14 @@ struct ubifs_bud { * struct ubifs_jhead - journal head. * @wbuf: head's write-buffer * @buds_list: list of bud LEBs belonging to this journal head + * @grouped: non-zero if UBIFS groups nodes when writing to this journal head * * Note, the @buds list is protected by the @c->buds_lock. */ struct ubifs_jhead { struct ubifs_wbuf wbuf; struct list_head buds_list; + unsigned int grouped:1; }; /** @@ -1614,7 +1616,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); +int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc); /* commit.c */ int ubifs_bg_thread(void *info); @@ -1742,7 +1744,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); int ubifs_recover_master_node(struct ubifs_info *c); int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, - int offs, void *sbuf, int grouped); + int offs, void *sbuf, int jhead); struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf); int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 4d76594c2a8f..f1dce848ef96 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; - dentry_unhash(dentry); - retval = -ENOENT; fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) @@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { if (ofibh.sbh != ofibh.ebh) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 953ebdfc5bf7..29309e25417f 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err= -ENOTEMPTY; - dentry_unhash(dentry); - lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); @@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; - if (new_inode && S_ISDIR(new_inode->i_mode)) - dentry_unhash(new_dentry); - old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/xattr.c b/fs/xattr.c index f1ef94974dea..f060663ab70c 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -46,18 +46,22 @@ xattr_permission(struct inode *inode, const char *name, int mask) return 0; /* - * The trusted.* namespace can only be accessed by a privileged user. + * The trusted.* namespace can only be accessed by privileged users. */ - if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) - return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + if (!capable(CAP_SYS_ADMIN)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + return 0; + } - /* In user.* namespace, only regular files and directories can have + /* + * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and - * privileged user can write attributes. + * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) - return -EPERM; + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) return -EPERM; @@ -87,7 +91,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, { struct inode *inode = dentry->d_inode; int error = -EOPNOTSUPP; + int issec = !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); + if (issec) + inode->i_flags &= ~S_NOSEC; if (inode->i_op->setxattr) { error = inode->i_op->setxattr(dentry, name, value, size, flags); if (!error) { @@ -95,8 +103,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, security_inode_post_setxattr(dentry, name, value, size, flags); } - } else if (!strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) { + } else if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 98b9c91fcdf1..1e3a7ce804dc 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -925,7 +925,8 @@ xfs_fs_inode_init_once( */ STATIC void xfs_fs_dirty_inode( - struct inode *inode) + struct inode *inode, + int flags) { barrier(); XFS_I(inode)->i_update_core = 1; |