summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/affs/namei.c5
-rw-r--r--fs/afs/dir.c5
-rw-r--r--fs/attr.c7
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/bfs/dir.c3
-rw-r--r--fs/bio.c16
-rw-r--r--fs/block_dev.c4
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c28
-rw-r--r--fs/btrfs/ctree.h24
-rw-r--r--fs/btrfs/delayed-inode.c8
-rw-r--r--fs/btrfs/disk-io.c36
-rw-r--r--fs/btrfs/extent-tree.c103
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/btrfs/file.c10
-rw-r--r--fs/btrfs/free-space-cache.c70
-rw-r--r--fs/btrfs/inode-map.c34
-rw-r--r--fs/btrfs/inode.c263
-rw-r--r--fs/btrfs/ioctl.c26
-rw-r--r--fs/btrfs/relocation.c34
-rw-r--r--fs/btrfs/scrub.c123
-rw-r--r--fs/btrfs/super.c8
-rw-r--r--fs/btrfs/transaction.c302
-rw-r--r--fs/btrfs/transaction.h29
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/cifs/cifsacl.c3
-rw-r--r--fs/coda/dir.c5
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/ecryptfs/crypto.c74
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h26
-rw-r--r--fs/ecryptfs/file.c2
-rw-r--r--fs/ecryptfs/inode.c286
-rw-r--r--fs/ecryptfs/main.c84
-rw-r--r--fs/ecryptfs/super.c16
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/fat/namei_msdos.c5
-rw-r--r--fs/fat/namei_vfat.c5
-rw-r--r--fs/fs-writeback.c5
-rw-r--r--fs/fuse/dir.c5
-rw-r--r--fs/hfs/dir.c6
-rw-r--r--fs/hfsplus/dir.c8
-rw-r--r--fs/hostfs/hostfs_kern.c5
-rw-r--r--fs/hpfs/namei.c5
-rw-r--r--fs/inode.c54
-rw-r--r--fs/jffs2/dir.c5
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/jfs/namei.c5
-rw-r--r--fs/logfs/dir.c5
-rw-r--r--fs/minix/namei.c5
-rw-r--r--fs/namei.c44
-rw-r--r--fs/ncpfs/dir.c15
-rw-r--r--fs/nfs/Kconfig10
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback.h17
-rw-r--r--fs/nfs/callback_proc.c51
-rw-r--r--fs/nfs/callback_xdr.c96
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/delegation.c14
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/inode.c11
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs4filelayout.c38
-rw-r--r--fs/nfs/nfs4filelayout.h8
-rw-r--r--fs/nfs/nfs4filelayoutdev.c119
-rw-r--r--fs/nfs/nfs4proc.c107
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/nfs4xdr.c132
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c1057
-rw-r--r--fs/nfs/objlayout/objlayout.c712
-rw-r--r--fs/nfs/objlayout/objlayout.h187
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c412
-rw-r--r--fs/nfs/pagelist.c62
-rw-r--r--fs/nfs/pnfs.c342
-rw-r--r--fs/nfs/pnfs.h117
-rw-r--r--fs/nfs/pnfs_dev.c270
-rw-r--r--fs/nfs/read.c9
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/write.c10
-rw-r--r--fs/nfsd/export.c6
-rw-r--r--fs/nfsd/nfs3proc.c2
-rw-r--r--fs/nfsd/nfs3xdr.c2
-rw-r--r--fs/nfsd/nfs4proc.c73
-rw-r--r--fs/nfsd/nfs4state.c42
-rw-r--r--fs/nfsd/nfs4xdr.c11
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/vfs.c33
-rw-r--r--fs/nfsd/vfs.h6
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/namei.c5
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/omfs/dir.c11
-rw-r--r--fs/partitions/check.c10
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/reiserfs/namei.c5
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/squashfs/export.c2
-rw-r--r--fs/squashfs/fragment.c2
-rw-r--r--fs/squashfs/id.c2
-rw-r--r--fs/squashfs/super.c6
-rw-r--r--fs/sysv/namei.c5
-rw-r--r--fs/ubifs/dir.c5
-rw-r--r--fs/ubifs/io.c2
-rw-r--r--fs/ubifs/journal.c1
-rw-r--r--fs/ubifs/orphan.c2
-rw-r--r--fs/ubifs/recovery.c164
-rw-r--r--fs/ubifs/replay.c3
-rw-r--r--fs/ubifs/shrinker.c9
-rw-r--r--fs/ubifs/super.c44
-rw-r--r--fs/ubifs/tnc.c9
-rw-r--r--fs/ubifs/ubifs.h6
-rw-r--r--fs/udf/namei.c5
-rw-r--r--fs/ufs/namei.c5
-rw-r--r--fs/xattr.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
125 files changed, 4779 insertions, 1421 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d7f3e69ae29..7f6c67703195 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
{
- dentry_unhash(d);
return v9fs_remove(i, d, 1);
}
@@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct p9_fid *newdirfid;
struct p9_wstat wstat;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
P9_DPRINTK(P9_DEBUG_VFS, "\n");
retval = 0;
old_inode = old_dentry->d_inode;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 03330e2e390c..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
dentry->d_inode->i_ino,
(int)dentry->d_name.len, dentry->d_name.name);
- dentry_unhash(dentry);
-
return affs_remove_header(dentry);
}
@@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
(u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
(u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2c4e05160042..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
_enter("{%x:%u},{%s}",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
- dentry_unhash(dentry);
-
ret = -ENAMETOOLONG;
if (dentry->d_name.len >= AFSNAMEMAX)
goto error;
@@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct key *key;
int ret;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
vnode = AFS_FS_I(old_dentry->d_inode);
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/attr.c b/fs/attr.c
index 91dbe2a107f2..caf2aa521e2b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -175,6 +175,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
return -EPERM;
}
+ if ((ia_valid & ATTR_MODE)) {
+ mode_t amode = attr->ia_mode;
+ /* Flag setting protected by i_mutex */
+ if (is_sxid(amode))
+ inode->i_flags &= ~S_NOSEC;
+ }
+
now = current_fs_time(inode->i_sb);
attr->ia_ctime = now;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 87d95a8cddbc..f55ae23b137e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EACCES;
- dentry_unhash(dentry);
-
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index c7d1d06b0483..b14cebfd9047 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct bfs_sb_info *info;
int error = -ENOENT;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_bh = new_bh = NULL;
old_inode = old_dentry->d_inode;
if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/bio.c b/fs/bio.c
index 840a0d755248..9bfade8a609b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -638,10 +638,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* @offset: vec entry offset
*
* Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block
- * device limitations. The target block device must allow bio's
- * smaller than PAGE_SIZE, so it is always possible to add a single
- * page to an empty bio. This should only be used by REQ_PC bios.
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ *
+ * This should only be used by REQ_PC bios.
*/
int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
@@ -659,10 +660,9 @@ EXPORT_SYMBOL(bio_add_pc_page);
* @offset: vec entry offset
*
* Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block
- * device limitations. The target block device must allow bio's
- * smaller than PAGE_SIZE, so it is always possible to add a single
- * page to an empty bio.
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
*/
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
unsigned int offset)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1f2b19978333..1a2421f908f0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1272,8 +1272,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
* individual writeable reference is too fragile given the
* way @mode is used in blkdev_get/put().
*/
- if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
- !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+ if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+ (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
bdev->bd_write_holder = true;
disk_block_events(disk);
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 93b1aa932014..52d7eca8c7bf 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -121,9 +121,6 @@ struct btrfs_inode {
*/
u64 index_cnt;
- /* the start of block group preferred for allocations. */
- u64 block_group;
-
/* the fsync log has some corner cases that mean we have to check
* directories to see if any unlinks have been done before
* the directory was logged. See tree-log.c for all the
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b0e18d986e0a..d84089349c82 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -43,8 +43,6 @@ struct btrfs_path *btrfs_alloc_path(void)
{
struct btrfs_path *path;
path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
- if (path)
- path->reada = 1;
return path;
}
@@ -1224,6 +1222,7 @@ static void reada_for_search(struct btrfs_root *root,
u64 search;
u64 target;
u64 nread = 0;
+ u64 gen;
int direction = path->reada;
struct extent_buffer *eb;
u32 nr;
@@ -1251,6 +1250,15 @@ static void reada_for_search(struct btrfs_root *root,
nritems = btrfs_header_nritems(node);
nr = slot;
while (1) {
+ if (!node->map_token) {
+ unsigned long offset = btrfs_node_key_ptr_offset(nr);
+ map_private_extent_buffer(node, offset,
+ sizeof(struct btrfs_key_ptr),
+ &node->map_token,
+ &node->kaddr,
+ &node->map_start,
+ &node->map_len, KM_USER1);
+ }
if (direction < 0) {
if (nr == 0)
break;
@@ -1268,14 +1276,23 @@ static void reada_for_search(struct btrfs_root *root,
search = btrfs_node_blockptr(node, nr);
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
- readahead_tree_block(root, search, blocksize,
- btrfs_node_ptr_generation(node, nr));
+ gen = btrfs_node_ptr_generation(node, nr);
+ if (node->map_token) {
+ unmap_extent_buffer(node, node->map_token,
+ KM_USER1);
+ node->map_token = NULL;
+ }
+ readahead_tree_block(root, search, blocksize, gen);
nread += blocksize;
}
nscan++;
if ((nread > 65536 || nscan > 32))
break;
}
+ if (node->map_token) {
+ unmap_extent_buffer(node, node->map_token, KM_USER1);
+ node->map_token = NULL;
+ }
}
/*
@@ -1648,9 +1665,6 @@ again:
}
cow_done:
BUG_ON(!cow && ins_len);
- if (level != btrfs_header_level(b))
- WARN_ON(1);
- level = btrfs_header_level(b);
p->nodes[level] = b;
if (!p->skip_locking)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 332323e19dd1..378b5b4443f3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -930,7 +930,6 @@ struct btrfs_fs_info {
* is required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
- u64 open_ioctl_trans;
unsigned long mount_opt:20;
unsigned long compress_type:4;
u64 max_inline;
@@ -947,7 +946,6 @@ struct btrfs_fs_info {
struct super_block *sb;
struct inode *btree_inode;
struct backing_dev_info bdi;
- struct mutex trans_mutex;
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
@@ -968,6 +966,7 @@ struct btrfs_fs_info {
struct rw_semaphore subvol_sem;
struct srcu_struct subvol_srcu;
+ spinlock_t trans_lock;
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
@@ -980,6 +979,7 @@ struct btrfs_fs_info {
atomic_t async_submit_draining;
atomic_t nr_async_bios;
atomic_t async_delalloc_pages;
+ atomic_t open_ioctl_trans;
/*
* this is used by the balancing code to wait for all the pending
@@ -1044,6 +1044,7 @@ struct btrfs_fs_info {
int closing;
int log_root_recovering;
int enospc_unlink;
+ int trans_no_join;
u64 total_pinned;
@@ -1065,7 +1066,6 @@ struct btrfs_fs_info {
struct reloc_control *reloc_ctl;
spinlock_t delalloc_lock;
- spinlock_t new_trans_lock;
u64 delalloc_bytes;
/* data_alloc_cluster is only used in ssd mode */
@@ -1340,6 +1340,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
+#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2238,6 +2239,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2350,6 +2354,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * Get synced with close_ctree()
+ */
+ smp_mb();
+ return fs_info->closing;
+}
+
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
@@ -2512,8 +2525,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- u64 new_dirid, u64 alloc_hint);
+ struct btrfs_root *new_root, u64 new_dirid);
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2524,7 +2536,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode);
+void btrfs_dirty_inode(struct inode *inode, int flags);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 01e29503a54b..6462c29d2d37 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -678,6 +678,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&head);
next = item;
+ nitems = 0;
/*
* count the number of the continuous items that we can insert in batch
@@ -1129,7 +1130,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
delayed_node = async_node->delayed_node;
root = delayed_node->root;
- trans = btrfs_join_transaction(root, 0);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
goto free_path;
@@ -1572,8 +1573,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
- btrfs_set_stack_inode_block_group(inode_item,
- BTRFS_I(inode)->block_group);
+ btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
inode->i_atime.tv_sec);
@@ -1595,7 +1595,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
struct btrfs_delayed_node *delayed_node;
- int ret;
+ int ret = 0;
delayed_node = btrfs_get_or_create_delayed_node(inode);
if (IS_ERR(delayed_node))
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 98b6a71decba..a203d363184d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1505,24 +1505,24 @@ static int transaction_kthread(void *arg)
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
- spin_lock(&root->fs_info->new_trans_lock);
+ spin_lock(&root->fs_info->trans_lock);
cur = root->fs_info->running_transaction;
if (!cur) {
- spin_unlock(&root->fs_info->new_trans_lock);
+ spin_unlock(&root->fs_info->trans_lock);
goto sleep;
}
now = get_seconds();
if (!cur->blocked &&
(now < cur->start_time || now - cur->start_time < 30)) {
- spin_unlock(&root->fs_info->new_trans_lock);
+ spin_unlock(&root->fs_info->trans_lock);
delay = HZ * 5;
goto sleep;
}
transid = cur->transid;
- spin_unlock(&root->fs_info->new_trans_lock);
+ spin_unlock(&root->fs_info->trans_lock);
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
if (transid == trans->transid) {
ret = btrfs_commit_transaction(trans, root);
@@ -1613,7 +1613,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->ordered_operations);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
- spin_lock_init(&fs_info->new_trans_lock);
+ spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
@@ -1645,6 +1645,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
+ fs_info->trans_no_join = 0;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -1709,7 +1710,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->do_barriers = 1;
- mutex_init(&fs_info->trans_mutex);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->chunk_mutex);
@@ -2479,13 +2479,13 @@ int btrfs_commit_super(struct btrfs_root *root)
down_write(&root->fs_info->cleanup_work_sem);
up_write(&root->fs_info->cleanup_work_sem);
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
/* run commit again to drop the original snapshot */
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_commit_transaction(trans, root);
@@ -3024,10 +3024,13 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
WARN_ON(1);
- mutex_lock(&root->fs_info->trans_mutex);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
+ spin_lock(&root->fs_info->trans_lock);
list_splice_init(&root->fs_info->trans_list, &list);
+ root->fs_info->trans_no_join = 1;
+ spin_unlock(&root->fs_info->trans_lock);
+
while (!list_empty(&list)) {
t = list_entry(list.next, struct btrfs_transaction, list);
if (!t)
@@ -3052,23 +3055,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
t->blocked = 0;
if (waitqueue_active(&root->fs_info->transaction_wait))
wake_up(&root->fs_info->transaction_wait);
- mutex_unlock(&root->fs_info->trans_mutex);
- mutex_lock(&root->fs_info->trans_mutex);
t->commit_done = 1;
if (waitqueue_active(&t->commit_wait))
wake_up(&t->commit_wait);
- mutex_unlock(&root->fs_info->trans_mutex);
-
- mutex_lock(&root->fs_info->trans_mutex);
btrfs_destroy_pending_snapshots(t);
btrfs_destroy_delalloc_inodes(root);
- spin_lock(&root->fs_info->new_trans_lock);
+ spin_lock(&root->fs_info->trans_lock);
root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->new_trans_lock);
+ spin_unlock(&root->fs_info->trans_lock);
btrfs_destroy_marked_extents(root, &t->dirty_pages,
EXTENT_DIRTY);
@@ -3082,8 +3080,10 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
kmem_cache_free(btrfs_transaction_cachep, t);
}
+ spin_lock(&root->fs_info->trans_lock);
+ root->fs_info->trans_no_join = 0;
+ spin_unlock(&root->fs_info->trans_lock);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
- mutex_unlock(&root->fs_info->trans_mutex);
return 0;
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 169bd62ce776..5b9b6b6df242 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -348,7 +348,7 @@ static int caching_kthread(void *data)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 2;
+ path->reada = 1;
key.objectid = last;
key.offset = 0;
@@ -366,8 +366,7 @@ again:
nritems = btrfs_header_nritems(leaf);
while (1) {
- smp_mb();
- if (fs_info->closing > 1) {
+ if (btrfs_fs_closing(fs_info) > 1) {
last = (u64)-1;
break;
}
@@ -379,15 +378,18 @@ again:
if (ret)
break;
- caching_ctl->progress = last;
- btrfs_release_path(path);
- up_read(&fs_info->extent_commit_sem);
- mutex_unlock(&caching_ctl->mutex);
- if (btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
- else
+ if (need_resched() ||
+ btrfs_next_leaf(extent_root, path)) {
+ caching_ctl->progress = last;
+ btrfs_release_path(path);
+ up_read(&fs_info->extent_commit_sem);
+ mutex_unlock(&caching_ctl->mutex);
cond_resched();
- goto again;
+ goto again;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
}
if (key.objectid < block_group->key.objectid) {
@@ -3065,7 +3067,7 @@ again:
spin_unlock(&data_sinfo->lock);
alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3091,9 +3093,10 @@ alloc:
/* commit the current transaction and try again */
commit_trans:
- if (!committed && !root->fs_info->open_ioctl_trans) {
+ if (!committed &&
+ !atomic_read(&root->fs_info->open_ioctl_trans)) {
committed = 1;
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
@@ -3472,7 +3475,7 @@ again:
goto out;
ret = -ENOSPC;
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
goto out;
ret = btrfs_commit_transaction(trans, root);
@@ -3699,7 +3702,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
if (trans)
return -EAGAIN;
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
ret = btrfs_commit_transaction(trans, root);
return 0;
@@ -3837,6 +3840,37 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
}
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
+ u64 num_bytes;
+ int ret;
+
+ /*
+ * Truncate should be freeing data, but give us 2 items just in case it
+ * needs to use some space. We may want to be smarter about this in the
+ * future.
+ */
+ num_bytes = btrfs_calc_trans_metadata_size(root, 2);
+
+ /* We already have enough bytes, just return */
+ if (rsv->reserved >= num_bytes)
+ return 0;
+
+ num_bytes -= rsv->reserved;
+
+ /*
+ * You should have reserved enough space before hand to do this, so this
+ * should not fail.
+ */
+ ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
+ BUG_ON(ret);
+
+ return 0;
+}
+
int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
int num_items)
@@ -3877,23 +3911,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
/*
- * one for deleting orphan item, one for updating inode and
- * two for calling btrfs_truncate_inode_items.
- *
- * btrfs_truncate_inode_items is a delete operation, it frees
- * more space than it uses in most cases. So two units of
- * metadata space should be enough for calling it many times.
- * If all of the metadata space is used, we can commit
- * transaction and use space it freed.
+ * We need to hold space in order to delete our orphan item once we've
+ * added it, so this takes the reservation so we can release it later
+ * when we are truly done with the orphan item.
*/
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
void btrfs_orphan_release_metadata(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
@@ -4987,6 +5016,15 @@ have_block_group:
if (unlikely(block_group->ro))
goto loop;
+ spin_lock(&block_group->free_space_ctl->tree_lock);
+ if (cached &&
+ block_group->free_space_ctl->free_space <
+ num_bytes + empty_size) {
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+ goto loop;
+ }
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+
/*
* Ok we want to try and use the cluster allocator, so lets look
* there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
@@ -5150,6 +5188,7 @@ checks:
btrfs_add_free_space(block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
+ btrfs_put_block_group(block_group);
break;
loop:
failed_cluster_refill = false;
@@ -5242,14 +5281,7 @@ loop:
ret = -ENOSPC;
} else if (!ins->objectid) {
ret = -ENOSPC;
- }
-
- /* we found what we needed */
- if (ins->objectid) {
- if (!(data & BTRFS_BLOCK_GROUP_DATA))
- trans->block_group = block_group->key.objectid;
-
- btrfs_put_block_group(block_group);
+ } else if (ins->objectid) {
ret = 0;
}
@@ -6526,7 +6558,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
BUG_ON(cache->ro);
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
alloc_flags = update_block_group_flags(root, cache->flags);
@@ -6882,6 +6914,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = 1;
cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
if (cache_gen != 0 &&
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c5d9fbb92bc3..7055d11c1efd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1476,7 +1476,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
if (total_bytes >= max_bytes)
break;
if (!found) {
- *start = state->start;
+ *start = max(cur_start, state->start);
found = 1;
}
last = state->end;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c6a22d783c35..fa4ef18b66b1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -129,7 +129,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
if (!btrfs_test_opt(root, AUTO_DEFRAG))
return 0;
- if (root->fs_info->closing)
+ if (btrfs_fs_closing(root->fs_info))
return 0;
if (BTRFS_I(inode)->in_defrag)
@@ -144,7 +144,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
if (!defrag)
return -ENOMEM;
- defrag->ino = inode->i_ino;
+ defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->root = root->root_key.objectid;
@@ -229,7 +229,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
first_ino = defrag->ino + 1;
rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
- if (fs_info->closing)
+ if (btrfs_fs_closing(fs_info))
goto next_free;
spin_unlock(&fs_info->defrag_inodes_lock);
@@ -1480,14 +1480,12 @@ int btrfs_sync_file(struct file *file, int datasync)
* the current transaction, we can bail out now without any
* syncing
*/
- mutex_lock(&root->fs_info->trans_mutex);
+ smp_mb();
if (BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
- mutex_unlock(&root->fs_info->trans_mutex);
goto out;
}
- mutex_unlock(&root->fs_info->trans_mutex);
/*
* ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 70d45795d758..ad144736a5fd 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,7 +98,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
return inode;
spin_lock(&block_group->lock);
- if (!root->fs_info->closing) {
+ if (!btrfs_fs_closing(root->fs_info)) {
block_group->inode = igrab(inode);
block_group->iref = 1;
}
@@ -402,7 +402,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
spin_lock(&ctl->tree_lock);
ret = link_free_space(ctl, e);
spin_unlock(&ctl->tree_lock);
- BUG_ON(ret);
+ if (ret) {
+ printk(KERN_ERR "Duplicate entries in "
+ "free space cache, dumping\n");
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ goto free_cache;
+ }
} else {
e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
if (!e->bitmap) {
@@ -419,6 +426,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ctl->op->recalc_thresholds(ctl);
spin_unlock(&ctl->tree_lock);
list_add_tail(&e->list, &bitmaps);
+ if (ret) {
+ printk(KERN_ERR "Duplicate entries in "
+ "free space cache, dumping\n");
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ goto free_cache;
+ }
}
num_entries--;
@@ -478,8 +493,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
* If we're unmounting then just return, since this does a search on the
* normal root and not the commit root and we could deadlock.
*/
- smp_mb();
- if (fs_info->closing)
+ if (btrfs_fs_closing(fs_info))
return 0;
/*
@@ -575,10 +589,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
+
+ /* Since the first page has all of our checksums and our generation we
+ * need to calculate the offset into the page that we can start writing
+ * our entries.
+ */
+ first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
+
filemap_write_and_wait(inode->i_mapping);
btrfs_wait_ordered_range(inode, inode->i_size &
~(root->sectorsize - 1), (u64)-1);
+ /* make sure we don't overflow that first page */
+ if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
+ /* this is really the same as running out of space, where we also return 0 */
+ printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
+ ret = 0;
+ goto out_update;
+ }
+
/* We need a checksum per page. */
crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
if (!crc)
@@ -590,12 +619,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
return -1;
}
- /* Since the first page has all of our checksums and our generation we
- * need to calculate the offset into the page that we can start writing
- * our entries.
- */
- first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
-
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list))
cluster = list_entry(block_group->cluster_list.next,
@@ -857,12 +880,14 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
ret = 1;
out_free:
+ kfree(checksums);
+ kfree(pages);
+
+out_update:
if (ret != 1) {
invalidate_inode_pages2_range(inode->i_mapping, 0, index);
BTRFS_I(inode)->generation = 0;
}
- kfree(checksums);
- kfree(pages);
btrfs_update_inode(trans, root, inode);
return ret;
}
@@ -963,10 +988,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
* logically.
*/
if (bitmap) {
- WARN_ON(info->bitmap);
+ if (info->bitmap) {
+ WARN_ON_ONCE(1);
+ return -EEXIST;
+ }
p = &(*p)->rb_right;
} else {
- WARN_ON(!info->bitmap);
+ if (!info->bitmap) {
+ WARN_ON_ONCE(1);
+ return -EEXIST;
+ }
p = &(*p)->rb_left;
}
}
@@ -2481,7 +2512,7 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
return inode;
spin_lock(&root->cache_lock);
- if (!root->fs_info->closing)
+ if (!btrfs_fs_closing(root->fs_info))
root->cache_inode = igrab(inode);
spin_unlock(&root->cache_lock);
@@ -2504,12 +2535,14 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
int ret = 0;
u64 root_gen = btrfs_root_generation(&root->root_item);
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
/*
* If we're unmounting then just return, since this does a search on the
* normal root and not the commit root and we could deadlock.
*/
- smp_mb();
- if (fs_info->closing)
+ if (btrfs_fs_closing(fs_info))
return 0;
path = btrfs_alloc_path();
@@ -2543,6 +2576,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
struct inode *inode;
int ret;
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode))
return 0;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 3262cd17a12f..b4087e0fa871 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,6 +38,9 @@ static int caching_kthread(void *data)
int slot;
int ret;
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -59,8 +62,7 @@ again:
goto out;
while (1) {
- smp_mb();
- if (fs_info->closing)
+ if (btrfs_fs_closing(fs_info))
goto out;
leaf = path->nodes[0];
@@ -141,6 +143,9 @@ static void start_caching(struct btrfs_root *root)
int ret;
u64 objectid;
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+
spin_lock(&root->cache_lock);
if (root->cached != BTRFS_CACHE_NO) {
spin_unlock(&root->cache_lock);
@@ -178,6 +183,9 @@ static void start_caching(struct btrfs_root *root)
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
{
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return btrfs_find_free_objectid(root, objectid);
+
again:
*objectid = btrfs_find_ino_for_alloc(root);
@@ -201,6 +209,10 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+
again:
if (root->cached == BTRFS_CACHE_FINISHED) {
__btrfs_add_free_space(ctl, objectid, 1);
@@ -250,6 +262,9 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
struct rb_node *n;
u64 count;
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+
while (1) {
n = rb_first(rbroot);
if (!n)
@@ -388,9 +403,24 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
int prealloc;
bool retry = false;
+ /* only fs tree and subvol/snap needs ino cache */
+ if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
+ (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
+ return 0;
+
+ /* Don't save inode cache if we are deleting this root */
+ if (btrfs_root_refs(&root->root_item) == 0 &&
+ root != root->fs_info->tree_root)
+ return 0;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+
again:
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb51bb1fa44f..ebf95f7a44d6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -138,7 +138,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->leave_spinning = 1;
- btrfs_set_trans_block_group(trans, inode);
key.objectid = btrfs_ino(inode);
key.offset = start;
@@ -426,9 +425,8 @@ again:
}
}
if (start == 0) {
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
/* lets try to make an inline extent */
@@ -623,8 +621,9 @@ retry:
async_extent->start + async_extent->ram_size - 1,
GFP_NOFS);
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_reserve_extent(trans, root,
async_extent->compressed_size,
async_extent->compressed_size,
@@ -793,9 +792,8 @@ static noinline int cow_file_range(struct inode *inode,
int ret = 0;
BUG_ON(is_free_space_inode(root, inode));
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -1077,10 +1075,12 @@ static noinline int run_delalloc_nocow(struct inode *inode,
nolock = is_free_space_inode(root, inode);
if (nolock)
- trans = btrfs_join_transaction_nolock(root, 1);
+ trans = btrfs_join_transaction_nolock(root);
else
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
+
BUG_ON(IS_ERR(trans));
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
cow_start = (u64)-1;
cur_offset = start;
@@ -1519,8 +1519,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
{
struct btrfs_ordered_sum *sum;
- btrfs_set_trans_block_group(trans, inode);
-
list_for_each_entry(sum, list, list) {
btrfs_csum_file_blocks(trans,
BTRFS_I(inode)->root->fs_info->csum_root, sum);
@@ -1735,11 +1733,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret) {
if (nolock)
- trans = btrfs_join_transaction_nolock(root, 1);
+ trans = btrfs_join_transaction_nolock(root);
else
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
@@ -1752,11 +1749,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
0, &cached_state, GFP_NOFS);
if (nolock)
- trans = btrfs_join_transaction_nolock(root, 1);
+ trans = btrfs_join_transaction_nolock(root);
else
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2431,7 +2427,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
(u64)-1);
if (root->orphan_block_rsv || root->orphan_item_inserted) {
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
btrfs_end_transaction(trans, root);
}
@@ -2511,12 +2507,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key location;
int maybe_acls;
- u64 alloc_group_block;
u32 rdev;
int ret;
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -2526,6 +2522,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
+ if (!leaf->map_token)
+ map_private_extent_buffer(leaf, (unsigned long)inode_item,
+ sizeof(struct btrfs_inode_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
@@ -2555,8 +2557,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
BTRFS_I(inode)->index_cnt = (u64)-1;
BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
- alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
-
/*
* try to precache a NULL acl entry for files that don't have
* any xattrs or acls
@@ -2566,8 +2566,11 @@ static void btrfs_read_locked_inode(struct inode *inode)
if (!maybe_acls)
cache_no_acl(inode);
- BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
- alloc_group_block, 0);
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
btrfs_free_path(path);
inode_item = NULL;
@@ -2647,7 +2650,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_inode_transid(leaf, item, trans->transid);
btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
- btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+ btrfs_set_inode_block_group(leaf, item, 0);
if (leaf->map_token) {
unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
@@ -3004,8 +3007,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
-
btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
@@ -3094,8 +3095,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
-
if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
err = btrfs_unlink_subvol(trans, root, dir,
BTRFS_I(inode)->location.objectid,
@@ -3514,7 +3513,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
err = PTR_ERR(trans);
break;
}
- btrfs_set_trans_block_group(trans, inode);
err = btrfs_drop_extents(trans, inode, cur_offset,
cur_offset + hole_size,
@@ -3650,7 +3648,6 @@ void btrfs_evict_inode(struct inode *inode)
while (1) {
trans = btrfs_start_transaction(root, 0);
BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = root->orphan_block_rsv;
ret = btrfs_block_rsv_check(trans, root,
@@ -4133,7 +4130,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+
+ path->reada = 1;
if (key_type == BTRFS_DIR_INDEX_KEY) {
INIT_LIST_HEAD(&ins_list);
@@ -4268,18 +4266,16 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (BTRFS_I(inode)->dummy_inode)
return 0;
- smp_mb();
- if (root->fs_info->closing && is_free_space_inode(root, inode))
+ if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
nolock = true;
if (wbc->sync_mode == WB_SYNC_ALL) {
if (nolock)
- trans = btrfs_join_transaction_nolock(root, 1);
+ trans = btrfs_join_transaction_nolock(root);
else
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, inode);
if (nolock)
ret = btrfs_end_transaction_nolock(trans, root);
else
@@ -4294,7 +4290,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
-void btrfs_dirty_inode(struct inode *inode)
+void btrfs_dirty_inode(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -4303,9 +4299,8 @@ void btrfs_dirty_inode(struct inode *inode)
if (BTRFS_I(inode)->dummy_inode)
return;
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
ret = btrfs_update_inode(trans, root, inode);
if (ret && ret == -ENOSPC) {
@@ -4319,7 +4314,6 @@ void btrfs_dirty_inode(struct inode *inode)
PTR_ERR(trans));
return;
}
- btrfs_set_trans_block_group(trans, inode);
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
@@ -4418,8 +4412,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir,
const char *name, int name_len,
- u64 ref_objectid, u64 objectid,
- u64 alloc_hint, int mode, u64 *index)
+ u64 ref_objectid, u64 objectid, int mode,
+ u64 *index)
{
struct inode *inode;
struct btrfs_inode_item *inode_item;
@@ -4472,8 +4466,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
owner = 0;
else
owner = 1;
- BTRFS_I(inode)->block_group =
- btrfs_find_block_group(root, 0, alloc_hint, owner);
key[0].objectid = objectid;
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4629,15 +4621,13 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
-
err = btrfs_find_free_ino(root, &objectid);
if (err)
goto out_unlock;
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len, btrfs_ino(dir), objectid,
- BTRFS_I(dir)->block_group, mode, &index);
+ mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_unlock;
@@ -4649,7 +4639,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- btrfs_set_trans_block_group(trans, inode);
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
@@ -4658,8 +4647,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
btrfs_update_inode(trans, root, inode);
}
- btrfs_update_inode_block_group(trans, inode);
- btrfs_update_inode_block_group(trans, dir);
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
@@ -4692,15 +4679,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
-
err = btrfs_find_free_ino(root, &objectid);
if (err)
goto out_unlock;
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len, btrfs_ino(dir), objectid,
- BTRFS_I(dir)->block_group, mode, &index);
+ mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_unlock;
@@ -4712,7 +4697,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- btrfs_set_trans_block_group(trans, inode);
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
@@ -4723,8 +4707,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_file_inode_operations;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
- btrfs_update_inode_block_group(trans, inode);
- btrfs_update_inode_block_group(trans, dir);
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
@@ -4771,8 +4753,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_inc_nlink(inode);
inode->i_ctime = CURRENT_TIME;
-
- btrfs_set_trans_block_group(trans, dir);
ihold(inode);
err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
@@ -4781,7 +4761,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dget_parent(dentry);
- btrfs_update_inode_block_group(trans, dir);
err = btrfs_update_inode(trans, root, inode);
BUG_ON(err);
btrfs_log_new_name(trans, inode, NULL, parent);
@@ -4818,7 +4797,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_ino(root, &objectid);
if (err)
@@ -4826,8 +4804,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len, btrfs_ino(dir), objectid,
- BTRFS_I(dir)->block_group, S_IFDIR | mode,
- &index);
+ S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fail;
@@ -4841,7 +4818,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- btrfs_set_trans_block_group(trans, inode);
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, root, inode);
@@ -4855,8 +4831,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
d_instantiate(dentry, inode);
drop_on_err = 0;
- btrfs_update_inode_block_group(trans, inode);
- btrfs_update_inode_block_group(trans, dir);
out_fail:
nr = trans->blocks_used;
@@ -4989,7 +4963,15 @@ again:
if (!path) {
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /*
+ * Chances are we'll be called again, so go ahead and do
+ * readahead
+ */
+ path->reada = 1;
}
ret = btrfs_lookup_file_extent(trans, root, path,
@@ -5130,8 +5112,10 @@ again:
kunmap(page);
free_extent_map(em);
em = NULL;
+
btrfs_release_path(path);
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
+
if (IS_ERR(trans))
return ERR_CAST(trans);
goto again;
@@ -5375,7 +5359,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
}
- trans = btrfs_join_transaction(root, 0);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return ERR_CAST(trans);
@@ -5611,7 +5595,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* to make sure the current transaction stays open
* while we look for nocow cross refs
*/
- trans = btrfs_join_transaction(root, 0);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
goto must_cow;
@@ -5750,7 +5734,7 @@ again:
BUG_ON(!ordered);
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
err = -ENOMEM;
goto out;
@@ -6500,6 +6484,7 @@ out:
static int btrfs_truncate(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *rsv;
int ret;
int err = 0;
struct btrfs_trans_handle *trans;
@@ -6513,28 +6498,80 @@ static int btrfs_truncate(struct inode *inode)
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ /*
+ * Yes ladies and gentelment, this is indeed ugly. The fact is we have
+ * 3 things going on here
+ *
+ * 1) We need to reserve space for our orphan item and the space to
+ * delete our orphan item. Lord knows we don't want to have a dangling
+ * orphan item because we didn't reserve space to remove it.
+ *
+ * 2) We need to reserve space to update our inode.
+ *
+ * 3) We need to have something to cache all the space that is going to
+ * be free'd up by the truncate operation, but also have some slack
+ * space reserved in case it uses space during the truncate (thank you
+ * very much snapshotting).
+ *
+ * And we need these to all be seperate. The fact is we can use alot of
+ * space doing the truncate, and we have no earthly idea how much space
+ * we will use, so we need the truncate reservation to be seperate so it
+ * doesn't end up using space reserved for updating the inode or
+ * removing the orphan item. We also need to be able to stop the
+ * transaction and start a new one, which means we need to be able to
+ * update the inode several times, and we have no idea of knowing how
+ * many times that will be, so we can't just reserve 1 item for the
+ * entirety of the opration, so that has to be done seperately as well.
+ * Then there is the orphan item, which does indeed need to be held on
+ * to for the whole operation, and we need nobody to touch this reserved
+ * space except the orphan code.
+ *
+ * So that leaves us with
+ *
+ * 1) root->orphan_block_rsv - for the orphan deletion.
+ * 2) rsv - for the truncate reservation, which we will steal from the
+ * transaction reservation.
+ * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
+ * updating the inode.
+ */
+ rsv = btrfs_alloc_block_rsv(root);
+ if (!rsv)
+ return -ENOMEM;
+ btrfs_add_durable_block_rsv(root->fs_info, rsv);
+
+ trans = btrfs_start_transaction(root, 4);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out;
+ }
- btrfs_set_trans_block_group(trans, inode);
+ /*
+ * Reserve space for the truncate process. Truncate should be adding
+ * space, but if there are snapshots it may end up using space.
+ */
+ ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
+ BUG_ON(ret);
ret = btrfs_orphan_add(trans, inode);
if (ret) {
btrfs_end_transaction(trans, root);
- return ret;
+ goto out;
}
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root, nr);
- /* Now start a transaction for the truncate */
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = root->orphan_block_rsv;
+ /*
+ * Ok so we've already migrated our bytes over for the truncate, so here
+ * just reserve the one slot we need for updating the inode.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out;
+ }
+ trans->block_rsv = rsv;
/*
* setattr is responsible for setting the ordered_data_close flag,
@@ -6558,24 +6595,17 @@ static int btrfs_truncate(struct inode *inode)
while (1) {
if (!trans) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = root->orphan_block_rsv;
- }
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out;
+ }
- ret = btrfs_block_rsv_check(trans, root,
- root->orphan_block_rsv, 0, 5);
- if (ret == -EAGAIN) {
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- trans = NULL;
- continue;
- } else if (ret) {
- err = ret;
- break;
+ ret = btrfs_truncate_reserve_metadata(trans, root,
+ rsv);
+ BUG_ON(ret);
+
+ trans->block_rsv = rsv;
}
ret = btrfs_truncate_inode_items(trans, root, inode,
@@ -6586,6 +6616,7 @@ static int btrfs_truncate(struct inode *inode)
break;
}
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
err = ret;
@@ -6599,6 +6630,7 @@ static int btrfs_truncate(struct inode *inode)
}
if (ret == 0 && inode->i_nlink > 0) {
+ trans->block_rsv = root->orphan_block_rsv;
ret = btrfs_orphan_del(trans, inode);
if (ret)
err = ret;
@@ -6610,15 +6642,20 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_orphan_del(NULL, inode);
}
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
if (ret && !err)
err = ret;
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+
+out:
+ btrfs_free_block_rsv(root, rsv);
+
if (ret && !err)
err = ret;
- btrfs_btree_balance_dirty(root, nr);
return err;
}
@@ -6627,15 +6664,14 @@ static int btrfs_truncate(struct inode *inode)
* create a new subvolume directory/inode (helper for the ioctl).
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- u64 new_dirid, u64 alloc_hint)
+ struct btrfs_root *new_root, u64 new_dirid)
{
struct inode *inode;
int err;
u64 index = 0;
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
- new_dirid, alloc_hint, S_IFDIR | 0700, &index);
+ new_dirid, S_IFDIR | 0700, &index);
if (IS_ERR(inode))
return PTR_ERR(inode);
inode->i_op = &btrfs_dir_inode_operations;
@@ -6748,21 +6784,6 @@ void btrfs_destroy_inode(struct inode *inode)
spin_unlock(&root->fs_info->ordered_extent_lock);
}
- if (root == root->fs_info->tree_root) {
- struct btrfs_block_group_cache *block_group;
-
- block_group = btrfs_lookup_block_group(root->fs_info,
- BTRFS_I(inode)->block_group);
- if (block_group && block_group->inode == inode) {
- spin_lock(&block_group->lock);
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- btrfs_put_block_group(block_group);
- } else if (block_group) {
- btrfs_put_block_group(block_group);
- }
- }
-
spin_lock(&root->orphan_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
@@ -6948,8 +6969,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_notrans;
}
- btrfs_set_trans_block_group(trans, new_dir);
-
if (dest != root)
btrfs_record_root_in_trans(trans, dest);
@@ -7131,16 +7150,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, dir);
-
err = btrfs_find_free_ino(root, &objectid);
if (err)
goto out_unlock;
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len, btrfs_ino(dir), objectid,
- BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
- &index);
+ S_IFLNK|S_IRWXUGO, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_unlock;
@@ -7152,7 +7168,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- btrfs_set_trans_block_group(trans, inode);
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
drop_inode = 1;
@@ -7163,8 +7178,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_file_inode_operations;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
- btrfs_update_inode_block_group(trans, inode);
- btrfs_update_inode_block_group(trans, dir);
if (drop_inode)
goto out_unlock;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 85e818ce00c5..ac37040e426a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -243,7 +243,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
- trans = btrfs_join_transaction(root, 1);
+ trans = btrfs_join_transaction(root);
BUG_ON(IS_ERR(trans));
ret = btrfs_update_inode(trans, root, inode);
@@ -414,8 +414,7 @@ static noinline int create_subvol(struct btrfs_root *root,
btrfs_record_root_in_trans(trans, new_root);
- ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
- BTRFS_I(dir)->block_group);
+ ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
/*
* insert the directory item
*/
@@ -707,16 +706,17 @@ static int find_new_extents(struct btrfs_root *root,
struct btrfs_file_extent_item *extent;
int type;
int ret;
+ u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- min_key.objectid = inode->i_ino;
+ min_key.objectid = ino;
min_key.type = BTRFS_EXTENT_DATA_KEY;
min_key.offset = *off;
- max_key.objectid = inode->i_ino;
+ max_key.objectid = ino;
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
@@ -727,7 +727,7 @@ static int find_new_extents(struct btrfs_root *root,
path, 0, newer_than);
if (ret != 0)
goto none;
- if (min_key.objectid != inode->i_ino)
+ if (min_key.objectid != ino)
goto none;
if (min_key.type != BTRFS_EXTENT_DATA_KEY)
goto none;
@@ -2489,12 +2489,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
if (ret)
goto out;
- mutex_lock(&root->fs_info->trans_mutex);
- root->fs_info->open_ioctl_trans++;
- mutex_unlock(&root->fs_info->trans_mutex);
+ atomic_inc(&root->fs_info->open_ioctl_trans);
ret = -ENOMEM;
- trans = btrfs_start_ioctl_transaction(root, 0);
+ trans = btrfs_start_ioctl_transaction(root);
if (IS_ERR(trans))
goto out_drop;
@@ -2502,9 +2500,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
return 0;
out_drop:
- mutex_lock(&root->fs_info->trans_mutex);
- root->fs_info->open_ioctl_trans--;
- mutex_unlock(&root->fs_info->trans_mutex);
+ atomic_dec(&root->fs_info->open_ioctl_trans);
mnt_drop_write(file->f_path.mnt);
out:
return ret;
@@ -2738,9 +2734,7 @@ long btrfs_ioctl_trans_end(struct file *file)
btrfs_end_transaction(trans, root);
- mutex_lock(&root->fs_info->trans_mutex);
- root->fs_info->open_ioctl_trans--;
- mutex_unlock(&root->fs_info->trans_mutex);
+ atomic_dec(&root->fs_info->open_ioctl_trans);
mnt_drop_write(file->f_path.mnt);
return 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ca38eca70af0..b1ef27cc673b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -677,6 +677,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
err = -ENOMEM;
goto out;
}
+ path1->reada = 1;
+ path2->reada = 2;
node = alloc_backref_node(cache);
if (!node) {
@@ -1999,6 +2001,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = 1;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -2139,10 +2142,10 @@ int prepare_to_merge(struct reloc_control *rc, int err)
u64 num_bytes = 0;
int ret;
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&root->fs_info->trans_lock);
rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
rc->merging_rsv_size += rc->nodes_relocated * 2;
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->trans_lock);
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
@@ -2152,7 +2155,7 @@ again:
err = ret;
}
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
if (!err)
btrfs_block_rsv_release(rc->extent_root,
@@ -2211,9 +2214,9 @@ int merge_reloc_roots(struct reloc_control *rc)
int ret;
again:
root = rc->extent_root;
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&root->fs_info->trans_lock);
list_splice_init(&rc->reloc_roots, &reloc_roots);
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->trans_lock);
while (!list_empty(&reloc_roots)) {
found = 1;
@@ -3236,7 +3239,7 @@ truncate:
goto out;
}
- trans = btrfs_join_transaction(root, 0);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_free_path(path);
ret = PTR_ERR(trans);
@@ -3300,6 +3303,7 @@ static int find_data_references(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = 1;
root = read_fs_root(rc->extent_root->fs_info, ref_root);
if (IS_ERR(root)) {
@@ -3586,17 +3590,17 @@ next:
static void set_reloc_control(struct reloc_control *rc)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- mutex_lock(&fs_info->trans_mutex);
+ spin_lock(&fs_info->trans_lock);
fs_info->reloc_ctl = rc;
- mutex_unlock(&fs_info->trans_mutex);
+ spin_unlock(&fs_info->trans_lock);
}
static void unset_reloc_control(struct reloc_control *rc)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- mutex_lock(&fs_info->trans_mutex);
+ spin_lock(&fs_info->trans_lock);
fs_info->reloc_ctl = NULL;
- mutex_unlock(&fs_info->trans_mutex);
+ spin_unlock(&fs_info->trans_lock);
}
static int check_extent_flags(u64 flags)
@@ -3645,7 +3649,7 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->create_reloc_tree = 1;
set_reloc_control(rc);
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
BUG_ON(IS_ERR(trans));
btrfs_commit_transaction(trans, rc->extent_root);
return 0;
@@ -3668,6 +3672,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = 1;
ret = prepare_to_relocate(rc);
if (ret) {
@@ -3834,7 +3839,7 @@ restart:
btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
/* get rid of pinned extents */
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans))
err = PTR_ERR(trans);
else
@@ -4093,6 +4098,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->reada = -1;
key.objectid = BTRFS_TREE_RELOC_OBJECTID;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -4159,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
set_reloc_control(rc);
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
unset_reloc_control(rc);
err = PTR_ERR(trans);
@@ -4193,7 +4199,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
unset_reloc_control(rc);
- trans = btrfs_join_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans))
err = PTR_ERR(trans);
else
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dfed0c27ac3..df50fd1eca8f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -117,33 +117,37 @@ static void scrub_free_csums(struct scrub_dev *sdev)
}
}
+static void scrub_free_bio(struct bio *bio)
+{
+ int i;
+ struct page *last_page = NULL;
+
+ if (!bio)
+ return;
+
+ for (i = 0; i < bio->bi_vcnt; ++i) {
+ if (bio->bi_io_vec[i].bv_page == last_page)
+ continue;
+ last_page = bio->bi_io_vec[i].bv_page;
+ __free_page(last_page);
+ }
+ bio_put(bio);
+}
+
static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
int i;
- int j;
- struct page *last_page;
if (!sdev)
return;
for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
struct scrub_bio *sbio = sdev->bios[i];
- struct bio *bio;
if (!sbio)
break;
- bio = sbio->bio;
- if (bio) {
- last_page = NULL;
- for (j = 0; j < bio->bi_vcnt; ++j) {
- if (bio->bi_io_vec[j].bv_page == last_page)
- continue;
- last_page = bio->bi_io_vec[j].bv_page;
- __free_page(last_page);
- }
- bio_put(bio);
- }
+ scrub_free_bio(sbio->bio);
kfree(sbio);
}
@@ -156,8 +160,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
struct scrub_dev *sdev;
int i;
- int j;
- int ret;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
@@ -165,7 +167,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
goto nomem;
sdev->dev = dev;
for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
- struct bio *bio;
struct scrub_bio *sbio;
sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
@@ -173,32 +174,10 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
goto nomem;
sdev->bios[i] = sbio;
- bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
- if (!bio)
- goto nomem;
-
sbio->index = i;
sbio->sdev = sdev;
- sbio->bio = bio;
sbio->count = 0;
sbio->work.func = scrub_checksum;
- bio->bi_private = sdev->bios[i];
- bio->bi_end_io = scrub_bio_end_io;
- bio->bi_sector = 0;
- bio->bi_bdev = dev->bdev;
- bio->bi_size = 0;
-
- for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
- struct page *page;
- page = alloc_page(GFP_NOFS);
- if (!page)
- goto nomem;
-
- ret = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (!ret)
- goto nomem;
- }
- WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
if (i != SCRUB_BIOS_PER_DEV-1)
sdev->bios[i]->next_free = i + 1;
@@ -369,9 +348,6 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
int ret;
DECLARE_COMPLETION_ONSTACK(complete);
- /* we are going to wait on this IO */
- rw |= REQ_SYNC;
-
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_bdev = bdev;
bio->bi_sector = sector;
@@ -380,6 +356,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
bio->bi_private = &complete;
submit_bio(rw, bio);
+ /* this will also unplug the queue */
wait_for_completion(&complete);
ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -394,6 +371,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
sbio->err = err;
+ sbio->bio = bio;
btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}
@@ -453,6 +431,8 @@ static void scrub_checksum(struct btrfs_work *work)
}
out:
+ scrub_free_bio(sbio->bio);
+ sbio->bio = NULL;
spin_lock(&sdev->list_lock);
sbio->next_free = sdev->first_free;
sdev->first_free = sbio->index;
@@ -583,25 +563,50 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
static int scrub_submit(struct scrub_dev *sdev)
{
struct scrub_bio *sbio;
+ struct bio *bio;
+ int i;
if (sdev->curr == -1)
return 0;
sbio = sdev->bios[sdev->curr];
- sbio->bio->bi_sector = sbio->physical >> 9;
- sbio->bio->bi_size = sbio->count * PAGE_SIZE;
- sbio->bio->bi_next = NULL;
- sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
- sbio->bio->bi_comp_cpu = -1;
- sbio->bio->bi_bdev = sdev->dev->bdev;
+ bio = bio_alloc(GFP_NOFS, sbio->count);
+ if (!bio)
+ goto nomem;
+
+ bio->bi_private = sbio;
+ bio->bi_end_io = scrub_bio_end_io;
+ bio->bi_bdev = sdev->dev->bdev;
+ bio->bi_sector = sbio->physical >> 9;
+
+ for (i = 0; i < sbio->count; ++i) {
+ struct page *page;
+ int ret;
+
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ goto nomem;
+
+ ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+ if (!ret) {
+ __free_page(page);
+ goto nomem;
+ }
+ }
+
sbio->err = 0;
sdev->curr = -1;
atomic_inc(&sdev->in_flight);
- submit_bio(0, sbio->bio);
+ submit_bio(READ, bio);
return 0;
+
+nomem:
+ scrub_free_bio(bio);
+
+ return -ENOMEM;
}
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -633,7 +638,11 @@ again:
sbio->logical = logical;
} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
sbio->logical + sbio->count * PAGE_SIZE != logical) {
- scrub_submit(sdev);
+ int ret;
+
+ ret = scrub_submit(sdev);
+ if (ret)
+ return ret;
goto again;
}
sbio->spag[sbio->count].flags = flags;
@@ -645,8 +654,13 @@ again:
memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
}
++sbio->count;
- if (sbio->count == SCRUB_PAGES_PER_BIO || force)
- scrub_submit(sdev);
+ if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
+ int ret;
+
+ ret = scrub_submit(sdev);
+ if (ret)
+ return ret;
+ }
return 0;
}
@@ -727,6 +741,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
+ struct blk_plug plug;
u64 flags;
int ret;
int slot;
@@ -831,6 +846,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
* the scrub. This might currently (crc32) end up to be about 1MB
*/
start_stripe = 0;
+ blk_start_plug(&plug);
again:
logical = base + offset + start_stripe * increment;
for (i = start_stripe; i < nstripes; ++i) {
@@ -972,6 +988,7 @@ next:
scrub_submit(sdev);
out:
+ blk_finish_plug(&plug);
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
@@ -1166,7 +1183,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
int ret;
struct btrfs_device *dev;
- if (root->fs_info->closing)
+ if (btrfs_fs_closing(root->fs_info))
return -EINVAL;
/*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9b2e7e5bc3ef..117e74e3604b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -161,7 +161,8 @@ enum {
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
+ Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
+ Opt_inode_cache, Opt_err,
};
static match_table_t tokens = {
@@ -193,6 +194,7 @@ static match_table_t tokens = {
{Opt_enospc_debug, "enospc_debug"},
{Opt_subvolrootid, "subvolrootid=%d"},
{Opt_defrag, "autodefrag"},
+ {Opt_inode_cache, "inode_cache"},
{Opt_err, NULL},
};
@@ -361,6 +363,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
printk(KERN_INFO "btrfs: enabling disk space caching\n");
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
break;
+ case Opt_inode_cache:
+ printk(KERN_INFO "btrfs: enabling inode map caching\n");
+ btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
+ break;
case Opt_clear_cache:
printk(KERN_INFO "btrfs: force clearing of disk cache\n");
btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dc80f7156923..dd719662340e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,6 +35,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
{
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
+ BUG_ON(!list_empty(&transaction->list));
memset(transaction, 0, sizeof(*transaction));
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
@@ -49,46 +50,72 @@ static noinline void switch_commit_root(struct btrfs_root *root)
/*
* either allocate a new transaction or hop into the existing one
*/
-static noinline int join_transaction(struct btrfs_root *root)
+static noinline int join_transaction(struct btrfs_root *root, int nofail)
{
struct btrfs_transaction *cur_trans;
+
+ spin_lock(&root->fs_info->trans_lock);
+ if (root->fs_info->trans_no_join) {
+ if (!nofail) {
+ spin_unlock(&root->fs_info->trans_lock);
+ return -EBUSY;
+ }
+ }
+
cur_trans = root->fs_info->running_transaction;
- if (!cur_trans) {
- cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
- GFP_NOFS);
- if (!cur_trans)
- return -ENOMEM;
- root->fs_info->generation++;
- atomic_set(&cur_trans->num_writers, 1);
- cur_trans->num_joined = 0;
- cur_trans->transid = root->fs_info->generation;
- init_waitqueue_head(&cur_trans->writer_wait);
- init_waitqueue_head(&cur_trans->commit_wait);
- cur_trans->in_commit = 0;
- cur_trans->blocked = 0;
- atomic_set(&cur_trans->use_count, 1);
- cur_trans->commit_done = 0;
- cur_trans->start_time = get_seconds();
-
- cur_trans->delayed_refs.root = RB_ROOT;
- cur_trans->delayed_refs.num_entries = 0;
- cur_trans->delayed_refs.num_heads_ready = 0;
- cur_trans->delayed_refs.num_heads = 0;
- cur_trans->delayed_refs.flushing = 0;
- cur_trans->delayed_refs.run_delayed_start = 0;
- spin_lock_init(&cur_trans->delayed_refs.lock);
-
- INIT_LIST_HEAD(&cur_trans->pending_snapshots);
- list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
- extent_io_tree_init(&cur_trans->dirty_pages,
- root->fs_info->btree_inode->i_mapping);
- spin_lock(&root->fs_info->new_trans_lock);
- root->fs_info->running_transaction = cur_trans;
- spin_unlock(&root->fs_info->new_trans_lock);
- } else {
+ if (cur_trans) {
+ atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
+ spin_unlock(&root->fs_info->trans_lock);
+ return 0;
}
+ spin_unlock(&root->fs_info->trans_lock);
+
+ cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+ if (!cur_trans)
+ return -ENOMEM;
+ spin_lock(&root->fs_info->trans_lock);
+ if (root->fs_info->running_transaction) {
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ cur_trans = root->fs_info->running_transaction;
+ atomic_inc(&cur_trans->use_count);
+ atomic_inc(&cur_trans->num_writers);
+ cur_trans->num_joined++;
+ spin_unlock(&root->fs_info->trans_lock);
+ return 0;
+ }
+ atomic_set(&cur_trans->num_writers, 1);
+ cur_trans->num_joined = 0;
+ init_waitqueue_head(&cur_trans->writer_wait);
+ init_waitqueue_head(&cur_trans->commit_wait);
+ cur_trans->in_commit = 0;
+ cur_trans->blocked = 0;
+ /*
+ * One for this trans handle, one so it will live on until we
+ * commit the transaction.
+ */
+ atomic_set(&cur_trans->use_count, 2);
+ cur_trans->commit_done = 0;
+ cur_trans->start_time = get_seconds();
+
+ cur_trans->delayed_refs.root = RB_ROOT;
+ cur_trans->delayed_refs.num_entries = 0;
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.num_heads = 0;
+ cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
+ spin_lock_init(&cur_trans->commit_lock);
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
+ INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+ extent_io_tree_init(&cur_trans->dirty_pages,
+ root->fs_info->btree_inode->i_mapping);
+ root->fs_info->generation++;
+ cur_trans->transid = root->fs_info->generation;
+ root->fs_info->running_transaction = cur_trans;
+ spin_unlock(&root->fs_info->trans_lock);
return 0;
}
@@ -99,39 +126,28 @@ static noinline int join_transaction(struct btrfs_root *root)
* to make sure the old root from before we joined the transaction is deleted
* when the transaction commits
*/
-static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
if (root->ref_cows && root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ if (root->last_trans == trans->transid) {
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ return 0;
+ }
+ root->last_trans = trans->transid;
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
- root->last_trans = trans->transid;
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
btrfs_init_reloc_root(trans, root);
}
return 0;
}
-int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- if (!root->ref_cows)
- return 0;
-
- mutex_lock(&root->fs_info->trans_mutex);
- if (root->last_trans == trans->transid) {
- mutex_unlock(&root->fs_info->trans_mutex);
- return 0;
- }
-
- record_root_in_trans(trans, root);
- mutex_unlock(&root->fs_info->trans_mutex);
- return 0;
-}
-
/* wait for commit against the current transaction to become unblocked
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
@@ -140,21 +156,23 @@ static void wait_current_trans(struct btrfs_root *root)
{
struct btrfs_transaction *cur_trans;
+ spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
if (cur_trans && cur_trans->blocked) {
DEFINE_WAIT(wait);
atomic_inc(&cur_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
while (1) {
prepare_to_wait(&root->fs_info->transaction_wait, &wait,
TASK_UNINTERRUPTIBLE);
if (!cur_trans->blocked)
break;
- mutex_unlock(&root->fs_info->trans_mutex);
schedule();
- mutex_lock(&root->fs_info->trans_mutex);
}
finish_wait(&root->fs_info->transaction_wait, &wait);
put_transaction(cur_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
}
@@ -167,10 +185,16 @@ enum btrfs_trans_type {
static int may_wait_transaction(struct btrfs_root *root, int type)
{
- if (!root->fs_info->log_root_recovering &&
- ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
- type == TRANS_USERSPACE))
+ if (root->fs_info->log_root_recovering)
+ return 0;
+
+ if (type == TRANS_USERSPACE)
+ return 1;
+
+ if (type == TRANS_START &&
+ !atomic_read(&root->fs_info->open_ioctl_trans))
return 1;
+
return 0;
}
@@ -184,36 +208,44 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
return ERR_PTR(-EROFS);
+
+ if (current->journal_info) {
+ WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+ h = current->journal_info;
+ h->use_count++;
+ h->orig_rsv = h->block_rsv;
+ h->block_rsv = NULL;
+ goto got_it;
+ }
again:
h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h)
return ERR_PTR(-ENOMEM);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_lock(&root->fs_info->trans_mutex);
if (may_wait_transaction(root, type))
wait_current_trans(root);
- ret = join_transaction(root);
+ do {
+ ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+ if (ret == -EBUSY)
+ wait_current_trans(root);
+ } while (ret == -EBUSY);
+
if (ret < 0) {
kmem_cache_free(btrfs_trans_handle_cachep, h);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_unlock(&root->fs_info->trans_mutex);
return ERR_PTR(ret);
}
cur_trans = root->fs_info->running_transaction;
- atomic_inc(&cur_trans->use_count);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_unlock(&root->fs_info->trans_mutex);
h->transid = cur_trans->transid;
h->transaction = cur_trans;
h->blocks_used = 0;
- h->block_group = 0;
h->bytes_reserved = 0;
h->delayed_ref_updates = 0;
+ h->use_count = 1;
h->block_rsv = NULL;
+ h->orig_rsv = NULL;
smp_mb();
if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -241,11 +273,8 @@ again:
}
}
- if (type != TRANS_JOIN_NOLOCK)
- mutex_lock(&root->fs_info->trans_mutex);
- record_root_in_trans(h, root);
- if (type != TRANS_JOIN_NOLOCK)
- mutex_unlock(&root->fs_info->trans_mutex);
+got_it:
+ btrfs_record_root_in_trans(h, root);
if (!current->journal_info && type != TRANS_USERSPACE)
current->journal_info = h;
@@ -257,22 +286,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
{
return start_transaction(root, num_items, TRANS_START);
}
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
- int num_blocks)
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN);
}
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
- int num_blocks)
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
- int num_blocks)
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
- return start_transaction(r, 0, TRANS_USERSPACE);
+ return start_transaction(root, 0, TRANS_USERSPACE);
}
/* wait for a transaction commit to be fully complete */
@@ -280,17 +306,13 @@ static noinline int wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
{
DEFINE_WAIT(wait);
- mutex_lock(&root->fs_info->trans_mutex);
while (!commit->commit_done) {
prepare_to_wait(&commit->commit_wait, &wait,
TASK_UNINTERRUPTIBLE);
if (commit->commit_done)
break;
- mutex_unlock(&root->fs_info->trans_mutex);
schedule();
- mutex_lock(&root->fs_info->trans_mutex);
}
- mutex_unlock(&root->fs_info->trans_mutex);
finish_wait(&commit->commit_wait, &wait);
return 0;
}
@@ -300,59 +322,56 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
struct btrfs_transaction *cur_trans = NULL, *t;
int ret;
- mutex_lock(&root->fs_info->trans_mutex);
-
ret = 0;
if (transid) {
if (transid <= root->fs_info->last_trans_committed)
- goto out_unlock;
+ goto out;
/* find specified transaction */
+ spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
if (t->transid == transid) {
cur_trans = t;
+ atomic_inc(&cur_trans->use_count);
break;
}
if (t->transid > transid)
break;
}
+ spin_unlock(&root->fs_info->trans_lock);
ret = -EINVAL;
if (!cur_trans)
- goto out_unlock; /* bad transid */
+ goto out; /* bad transid */
} else {
/* find newest transaction that is committing | committed */
+ spin_lock(&root->fs_info->trans_lock);
list_for_each_entry_reverse(t, &root->fs_info->trans_list,
list) {
if (t->in_commit) {
if (t->commit_done)
- goto out_unlock;
+ goto out;
cur_trans = t;
+ atomic_inc(&cur_trans->use_count);
break;
}
}
+ spin_unlock(&root->fs_info->trans_lock);
if (!cur_trans)
- goto out_unlock; /* nothing committing|committed */
+ goto out; /* nothing committing|committed */
}
- atomic_inc(&cur_trans->use_count);
- mutex_unlock(&root->fs_info->trans_mutex);
-
wait_for_commit(root, cur_trans);
- mutex_lock(&root->fs_info->trans_mutex);
put_transaction(cur_trans);
ret = 0;
-out_unlock:
- mutex_unlock(&root->fs_info->trans_mutex);
+out:
return ret;
}
void btrfs_throttle(struct btrfs_root *root)
{
- mutex_lock(&root->fs_info->trans_mutex);
- if (!root->fs_info->open_ioctl_trans)
+ if (!atomic_read(&root->fs_info->open_ioctl_trans))
wait_current_trans(root);
- mutex_unlock(&root->fs_info->trans_mutex);
}
static int should_end_transaction(struct btrfs_trans_handle *trans,
@@ -370,6 +389,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *cur_trans = trans->transaction;
int updates;
+ smp_mb();
if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
return 1;
@@ -388,6 +408,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *info = root->fs_info;
int count = 0;
+ if (--trans->use_count) {
+ trans->block_rsv = trans->orig_rsv;
+ return 0;
+ }
+
while (count < 4) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
@@ -410,9 +435,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
- if (lock && !root->fs_info->open_ioctl_trans &&
- should_end_transaction(trans, root))
+ if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
+ should_end_transaction(trans, root)) {
trans->transaction->blocked = 1;
+ smp_wmb();
+ }
if (lock && cur_trans->blocked && !cur_trans->in_commit) {
if (throttle)
@@ -703,9 +730,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
*/
int btrfs_add_dead_root(struct btrfs_root *root)
{
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&root->fs_info->trans_lock);
list_add(&root->root_list, &root->fs_info->dead_roots);
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->trans_lock);
return 0;
}
@@ -721,6 +748,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
+ spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
(void **)gang, 0,
@@ -733,6 +761,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
@@ -753,10 +782,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
err = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
+ spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
}
}
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return err;
}
@@ -786,7 +817,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
btrfs_btree_balance_dirty(info->tree_root, nr);
cond_resched();
- if (root->fs_info->closing || ret != -EAGAIN)
+ if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
break;
}
root->defrag_running = 0;
@@ -851,7 +882,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
parent = dget_parent(dentry);
parent_inode = parent->d_inode;
parent_root = BTRFS_I(parent_inode)->root;
- record_root_in_trans(trans, parent_root);
+ btrfs_record_root_in_trans(trans, parent_root);
/*
* insert the directory item
@@ -869,7 +900,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
- record_root_in_trans(trans, root);
+ btrfs_record_root_in_trans(trans, root);
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
@@ -967,20 +998,20 @@ static void update_super_roots(struct btrfs_root *root)
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
int ret = 0;
- spin_lock(&info->new_trans_lock);
+ spin_lock(&info->trans_lock);
if (info->running_transaction)
ret = info->running_transaction->in_commit;
- spin_unlock(&info->new_trans_lock);
+ spin_unlock(&info->trans_lock);
return ret;
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
int ret = 0;
- spin_lock(&info->new_trans_lock);
+ spin_lock(&info->trans_lock);
if (info->running_transaction)
ret = info->running_transaction->blocked;
- spin_unlock(&info->new_trans_lock);
+ spin_unlock(&info->trans_lock);
return ret;
}
@@ -1004,9 +1035,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
&wait);
break;
}
- mutex_unlock(&root->fs_info->trans_mutex);
schedule();
- mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
}
}
@@ -1032,9 +1061,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
&wait);
break;
}
- mutex_unlock(&root->fs_info->trans_mutex);
schedule();
- mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&root->fs_info->transaction_wait,
&wait);
}
@@ -1072,7 +1099,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
INIT_DELAYED_WORK(&ac->work, do_async_commit);
ac->root = root;
- ac->newtrans = btrfs_join_transaction(root, 0);
+ ac->newtrans = btrfs_join_transaction(root);
if (IS_ERR(ac->newtrans)) {
int err = PTR_ERR(ac->newtrans);
kfree(ac);
@@ -1080,22 +1107,18 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
}
/* take transaction reference */
- mutex_lock(&root->fs_info->trans_mutex);
cur_trans = trans->transaction;
atomic_inc(&cur_trans->use_count);
- mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
schedule_delayed_work(&ac->work, 0);
/* wait for transaction to start and unblock */
- mutex_lock(&root->fs_info->trans_mutex);
if (wait_for_unblock)
wait_current_trans_commit_start_and_unblock(root, cur_trans);
else
wait_current_trans_commit_start(root, cur_trans);
put_transaction(cur_trans);
- mutex_unlock(&root->fs_info->trans_mutex);
return 0;
}
@@ -1139,38 +1162,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&cur_trans->commit_lock);
if (cur_trans->in_commit) {
+ spin_unlock(&cur_trans->commit_lock);
atomic_inc(&cur_trans->use_count);
- mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
ret = wait_for_commit(root, cur_trans);
BUG_ON(ret);
- mutex_lock(&root->fs_info->trans_mutex);
put_transaction(cur_trans);
- mutex_unlock(&root->fs_info->trans_mutex);
return 0;
}
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
+ spin_unlock(&cur_trans->commit_lock);
wake_up(&root->fs_info->transaction_blocked_wait);
+ spin_lock(&root->fs_info->trans_lock);
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
if (!prev_trans->commit_done) {
atomic_inc(&prev_trans->use_count);
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->trans_lock);
wait_for_commit(root, prev_trans);
- mutex_lock(&root->fs_info->trans_mutex);
put_transaction(prev_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
}
if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1178,12 +1204,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
do {
int snap_pending = 0;
+
joined = cur_trans->num_joined;
if (!list_empty(&trans->transaction->pending_snapshots))
snap_pending = 1;
WARN_ON(cur_trans != trans->transaction);
- mutex_unlock(&root->fs_info->trans_mutex);
if (flush_on_commit || snap_pending) {
btrfs_start_delalloc_inodes(root, 1);
@@ -1206,14 +1232,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
prepare_to_wait(&cur_trans->writer_wait, &wait,
TASK_UNINTERRUPTIBLE);
- smp_mb();
if (atomic_read(&cur_trans->num_writers) > 1)
schedule_timeout(MAX_SCHEDULE_TIMEOUT);
else if (should_grow)
schedule_timeout(1);
- mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&cur_trans->writer_wait, &wait);
+ spin_lock(&root->fs_info->trans_lock);
+ root->fs_info->trans_no_join = 1;
+ spin_unlock(&root->fs_info->trans_lock);
} while (atomic_read(&cur_trans->num_writers) > 1 ||
(should_grow && cur_trans->num_joined != joined));
@@ -1258,9 +1285,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_prepare_extent_commit(trans, root);
cur_trans = root->fs_info->running_transaction;
- spin_lock(&root->fs_info->new_trans_lock);
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->new_trans_lock);
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
@@ -1281,10 +1305,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
sizeof(root->fs_info->super_copy));
trans->transaction->blocked = 0;
+ spin_lock(&root->fs_info->trans_lock);
+ root->fs_info->running_transaction = NULL;
+ root->fs_info->trans_no_join = 0;
+ spin_unlock(&root->fs_info->trans_lock);
wake_up(&root->fs_info->transaction_wait);
- mutex_unlock(&root->fs_info->trans_mutex);
ret = btrfs_write_and_wait_transaction(trans, root);
BUG_ON(ret);
write_ctree_super(trans, root, 0);
@@ -1297,22 +1324,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- mutex_lock(&root->fs_info->trans_mutex);
-
cur_trans->commit_done = 1;
root->fs_info->last_trans_committed = cur_trans->transid;
wake_up(&cur_trans->commit_wait);
+ spin_lock(&root->fs_info->trans_lock);
list_del_init(&cur_trans->list);
+ spin_unlock(&root->fs_info->trans_lock);
+
put_transaction(cur_trans);
put_transaction(cur_trans);
trace_btrfs_transaction_commit(root);
- mutex_unlock(&root->fs_info->trans_mutex);
-
btrfs_scrub_continue(root);
if (current->journal_info == trans)
@@ -1334,9 +1360,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
LIST_HEAD(list);
struct btrfs_fs_info *fs_info = root->fs_info;
- mutex_lock(&fs_info->trans_mutex);
+ spin_lock(&fs_info->trans_lock);
list_splice_init(&fs_info->dead_roots, &list);
- mutex_unlock(&fs_info->trans_mutex);
+ spin_unlock(&fs_info->trans_lock);
while (!list_empty(&list)) {
root = list_entry(list.next, struct btrfs_root, root_list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 804c88639e5d..02564e6230ac 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -28,10 +28,12 @@ struct btrfs_transaction {
* transaction can end
*/
atomic_t num_writers;
+ atomic_t use_count;
unsigned long num_joined;
+
+ spinlock_t commit_lock;
int in_commit;
- atomic_t use_count;
int commit_done;
int blocked;
struct list_head list;
@@ -45,13 +47,14 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
- u64 block_group;
u64 bytes_reserved;
+ unsigned long use_count;
unsigned long blocks_reserved;
unsigned long blocks_used;
unsigned long delayed_ref_updates;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *orig_rsv;
};
struct btrfs_pending_snapshot {
@@ -66,19 +69,6 @@ struct btrfs_pending_snapshot {
struct list_head list;
};
-static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
- struct inode *inode)
-{
- trans->block_group = BTRFS_I(inode)->block_group;
-}
-
-static inline void btrfs_update_inode_block_group(
- struct btrfs_trans_handle *trans,
- struct inode *inode)
-{
- BTRFS_I(inode)->block_group = trans->block_group;
-}
-
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
@@ -92,12 +82,9 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items);
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
- int num_blocks);
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
- int num_blocks);
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
- int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c48214ef5c09..da541dfca2e3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -504,7 +504,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
BUG_ON(!new_device);
memcpy(new_device, device, sizeof(*new_device));
new_device->name = kstrdup(device->name, GFP_NOFS);
- BUG_ON(!new_device->name);
+ BUG_ON(device->name && !new_device->name);
new_device->bdev = NULL;
new_device->writeable = 0;
new_device->in_fs_metadata = 0;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f3107e4b4d56..5366fe452ab0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_set_trans_block_group(trans, inode);
-
ret = do_setxattr(trans, inode, name, value, size, flags);
if (ret)
goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 698c6b2cc462..49c9aada0374 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2382,6 +2382,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
ret = -EAGAIN;
goto out_unlock;
}
+ wait_on_page_writeback(page);
return 0;
out_unlock:
unlock_page(page);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 8f1700623b41..21de1d6d5849 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -74,8 +74,9 @@ shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
* Run idmap cache shrinker.
*/
static int
-cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
{
+ int nr_to_scan = sc->nr_to_scan;
int nr_del = 0;
int nr_rem = 0;
struct rb_root *root;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a46126fd5735..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
int len = de->d_name.len;
int error;
- dentry_unhash(de);
-
error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
if (!error) {
/* VFS may delete the child */
@@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
int new_length = new_dentry->d_name.len;
int error;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
coda_i2f(new_dir), old_length, new_length,
(const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9d17d350abc5..9a37a9b6de3a 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
- dentry_unhash(dentry);
-
if (dentry->d_parent == configfs_sb->s_root)
return -EPERM;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b8d5c8091024..58609bde3b9f 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1024,25 +1024,25 @@ out:
}
/**
- * contains_ecryptfs_marker - check for the ecryptfs marker
+ * ecryptfs_validate_marker - check for the ecryptfs marker
* @data: The data block in which to check
*
- * Returns one if marker found; zero if not found
+ * Returns zero if marker found; -EINVAL if not found
*/
-static int contains_ecryptfs_marker(char *data)
+static int ecryptfs_validate_marker(char *data)
{
u32 m_1, m_2;
m_1 = get_unaligned_be32(data);
m_2 = get_unaligned_be32(data + 4);
if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
- return 1;
+ return 0;
ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
"MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
MAGIC_ECRYPTFS_MARKER);
ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
"[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
- return 0;
+ return -EINVAL;
}
struct ecryptfs_flag_map_elem {
@@ -1201,27 +1201,19 @@ int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code)
return rc;
}
-int ecryptfs_read_and_validate_header_region(char *data,
- struct inode *ecryptfs_inode)
+int ecryptfs_read_and_validate_header_region(struct inode *inode)
{
- struct ecryptfs_crypt_stat *crypt_stat =
- &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
+ u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+ u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
int rc;
- if (crypt_stat->extent_size == 0)
- crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
- rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
- ecryptfs_inode);
- if (rc < 0) {
- printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
- if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
- rc = -EINVAL;
- } else
- rc = 0;
-out:
+ rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
+ inode);
+ if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+ return rc >= 0 ? -EINVAL : rc;
+ rc = ecryptfs_validate_marker(marker);
+ if (!rc)
+ ecryptfs_i_size_init(file_size, inode);
return rc;
}
@@ -1242,8 +1234,7 @@ ecryptfs_write_header_metadata(char *virt,
(*written) = 6;
}
-struct kmem_cache *ecryptfs_header_cache_1;
-struct kmem_cache *ecryptfs_header_cache_2;
+struct kmem_cache *ecryptfs_header_cache;
/**
* ecryptfs_write_headers_virt
@@ -1496,11 +1487,9 @@ static int ecryptfs_read_headers_virt(char *page_virt,
crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
offset = ECRYPTFS_FILE_SIZE_BYTES;
- rc = contains_ecryptfs_marker(page_virt + offset);
- if (rc == 0) {
- rc = -EINVAL;
+ rc = ecryptfs_validate_marker(page_virt + offset);
+ if (rc)
goto out;
- }
if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
@@ -1567,20 +1556,21 @@ out:
return rc;
}
-int ecryptfs_read_and_validate_xattr_region(char *page_virt,
- struct dentry *ecryptfs_dentry)
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
+ struct inode *inode)
{
+ u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+ u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
int rc;
- rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_dentry->d_inode);
- if (rc)
- goto out;
- if (!contains_ecryptfs_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES)) {
- printk(KERN_WARNING "Valid data found in [%s] xattr, but "
- "the marker is invalid\n", ECRYPTFS_XATTR_NAME);
- rc = -EINVAL;
- }
-out:
+ rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
+ ECRYPTFS_XATTR_NAME, file_size,
+ ECRYPTFS_SIZE_AND_MARKER_BYTES);
+ if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+ return rc >= 0 ? -EINVAL : rc;
+ rc = ecryptfs_validate_marker(marker);
+ if (!rc)
+ ecryptfs_i_size_init(file_size, inode);
return rc;
}
@@ -1610,7 +1600,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
mount_crypt_stat);
/* Read the first page from the underlying file */
- page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER);
+ page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER);
if (!page_virt) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Unable to allocate page_virt\n",
@@ -1655,7 +1645,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
out:
if (page_virt) {
memset(page_virt, 0, PAGE_CACHE_SIZE);
- kmem_cache_free(ecryptfs_header_cache_1, page_virt);
+ kmem_cache_free(ecryptfs_header_cache, page_virt);
}
return rc;
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e70282775e2c..43c7c43b06f5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -200,6 +200,8 @@ ecryptfs_get_key_payload_data(struct key *key)
#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */
#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64))
+#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \
+ + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
#define ECRYPTFS_DEFAULT_CIPHER "aes"
#define ECRYPTFS_DEFAULT_KEY_BYTES 16
#define ECRYPTFS_DEFAULT_HASH "md5"
@@ -603,8 +605,7 @@ extern struct kmem_cache *ecryptfs_file_info_cache;
extern struct kmem_cache *ecryptfs_dentry_info_cache;
extern struct kmem_cache *ecryptfs_inode_info_cache;
extern struct kmem_cache *ecryptfs_sb_info_cache;
-extern struct kmem_cache *ecryptfs_header_cache_1;
-extern struct kmem_cache *ecryptfs_header_cache_2;
+extern struct kmem_cache *ecryptfs_header_cache;
extern struct kmem_cache *ecryptfs_xattr_cache;
extern struct kmem_cache *ecryptfs_key_record_cache;
extern struct kmem_cache *ecryptfs_key_sig_cache;
@@ -625,14 +626,9 @@ struct ecryptfs_open_req {
struct list_head kthread_ctl_list;
};
-#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001
-int ecryptfs_interpose(struct dentry *hidden_dentry,
- struct dentry *this_dentry, struct super_block *sb,
- u32 flags);
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+ struct super_block *sb);
void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
-int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
- struct dentry *lower_dentry,
- struct inode *ecryptfs_dir_inode);
int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
size_t *decrypted_name_size,
struct dentry *ecryptfs_dentry,
@@ -664,10 +660,9 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
void ecryptfs_write_crypt_stat_flags(char *page_virt,
struct ecryptfs_crypt_stat *crypt_stat,
size_t *written);
-int ecryptfs_read_and_validate_header_region(char *data,
- struct inode *ecryptfs_inode);
-int ecryptfs_read_and_validate_xattr_region(char *page_virt,
- struct dentry *ecryptfs_dentry);
+int ecryptfs_read_and_validate_header_region(struct inode *inode);
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
+ struct inode *inode);
u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
@@ -679,9 +674,6 @@ int
ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
unsigned char *src, struct dentry *ecryptfs_dentry);
int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
-int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
-int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
-void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
ssize_t
ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
void *value, size_t size);
@@ -761,7 +753,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
struct dentry *lower_dentry,
struct vfsmount *lower_mnt,
const struct cred *cred);
-int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry);
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode);
void ecryptfs_put_lower_file(struct inode *inode);
int
ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 566e5472f78c..4ec9eb00a241 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,7 +191,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
| ECRYPTFS_ENCRYPTED);
}
mutex_unlock(&crypt_stat->cs_mutex);
- rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+ rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bc116b9ffcf2..7349ade17de6 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -51,6 +51,97 @@ static void unlock_dir(struct dentry *dir)
dput(dir);
}
+static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
+{
+ if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode)
+ return 1;
+ return 0;
+}
+
+static int ecryptfs_inode_set(struct inode *inode, void *opaque)
+{
+ struct inode *lower_inode = opaque;
+
+ ecryptfs_set_inode_lower(inode, lower_inode);
+ fsstack_copy_attr_all(inode, lower_inode);
+ /* i_size will be overwritten for encrypted regular files */
+ fsstack_copy_inode_size(inode, lower_inode);
+ inode->i_ino = lower_inode->i_ino;
+ inode->i_version++;
+ inode->i_mapping->a_ops = &ecryptfs_aops;
+
+ if (S_ISLNK(inode->i_mode))
+ inode->i_op = &ecryptfs_symlink_iops;
+ else if (S_ISDIR(inode->i_mode))
+ inode->i_op = &ecryptfs_dir_iops;
+ else
+ inode->i_op = &ecryptfs_main_iops;
+
+ if (S_ISDIR(inode->i_mode))
+ inode->i_fop = &ecryptfs_dir_fops;
+ else if (special_file(inode->i_mode))
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ else
+ inode->i_fop = &ecryptfs_main_fops;
+
+ return 0;
+}
+
+static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
+ struct super_block *sb)
+{
+ struct inode *inode;
+
+ if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
+ return ERR_PTR(-EXDEV);
+ if (!igrab(lower_inode))
+ return ERR_PTR(-ESTALE);
+ inode = iget5_locked(sb, (unsigned long)lower_inode,
+ ecryptfs_inode_test, ecryptfs_inode_set,
+ lower_inode);
+ if (!inode) {
+ iput(lower_inode);
+ return ERR_PTR(-EACCES);
+ }
+ if (!(inode->i_state & I_NEW))
+ iput(lower_inode);
+
+ return inode;
+}
+
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+ struct super_block *sb)
+{
+ struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
+
+ if (!IS_ERR(inode) && (inode->i_state & I_NEW))
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_interpose(struct dentry *lower_dentry,
+ struct dentry *dentry, struct super_block *sb)
+{
+ struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb);
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
/**
* ecryptfs_create_underlying_file
* @lower_dir_inode: inode of the parent in the lower fs of the new file
@@ -129,7 +220,7 @@ ecryptfs_do_create(struct inode *directory_inode,
goto out_lock;
}
rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
- directory_inode->i_sb, 0);
+ directory_inode->i_sb);
if (rc) {
ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
goto out_lock;
@@ -168,7 +259,8 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
"context; rc = [%d]\n", rc);
goto out;
}
- rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+ rc = ecryptfs_get_lower_file(ecryptfs_dentry,
+ ecryptfs_dentry->d_inode);
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
@@ -215,102 +307,90 @@ out:
return rc;
}
+static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
+{
+ struct ecryptfs_crypt_stat *crypt_stat;
+ int rc;
+
+ rc = ecryptfs_get_lower_file(dentry, inode);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the lower file for the dentry with name "
+ "[%s]; rc = [%d]\n", __func__,
+ dentry->d_name.name, rc);
+ return rc;
+ }
+
+ crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+ /* TODO: lock for crypt_stat comparison */
+ if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+ ecryptfs_set_default_sizes(crypt_stat);
+
+ rc = ecryptfs_read_and_validate_header_region(inode);
+ ecryptfs_put_lower_file(inode);
+ if (rc) {
+ rc = ecryptfs_read_and_validate_xattr_region(dentry, inode);
+ if (!rc)
+ crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+ }
+
+ /* Must return 0 to allow non-eCryptfs files to be looked up, too */
+ return 0;
+}
+
/**
- * ecryptfs_lookup_and_interpose_lower - Perform a lookup
+ * ecryptfs_lookup_interpose - Dentry interposition for a lookup
*/
-int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
- struct dentry *lower_dentry,
- struct inode *ecryptfs_dir_inode)
+static int ecryptfs_lookup_interpose(struct dentry *dentry,
+ struct dentry *lower_dentry,
+ struct inode *dir_inode)
{
- struct dentry *lower_dir_dentry;
+ struct inode *inode, *lower_inode = lower_dentry->d_inode;
+ struct ecryptfs_dentry_info *dentry_info;
struct vfsmount *lower_mnt;
- struct inode *lower_inode;
- struct ecryptfs_crypt_stat *crypt_stat;
- char *page_virt = NULL;
- int put_lower = 0, rc = 0;
-
- lower_dir_dentry = lower_dentry->d_parent;
- lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
- ecryptfs_dentry->d_parent));
- lower_inode = lower_dentry->d_inode;
- fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
+ int rc = 0;
+
+ lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
+ fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
BUG_ON(!lower_dentry->d_count);
- ecryptfs_set_dentry_private(ecryptfs_dentry,
- kmem_cache_alloc(ecryptfs_dentry_info_cache,
- GFP_KERNEL));
- if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
- rc = -ENOMEM;
+
+ dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+ ecryptfs_set_dentry_private(dentry, dentry_info);
+ if (!dentry_info) {
printk(KERN_ERR "%s: Out of memory whilst attempting "
"to allocate ecryptfs_dentry_info struct\n",
__func__);
- goto out_put;
+ dput(lower_dentry);
+ mntput(lower_mnt);
+ d_drop(dentry);
+ return -ENOMEM;
}
- ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
- ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
+ ecryptfs_set_dentry_lower(dentry, lower_dentry);
+ ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+
if (!lower_dentry->d_inode) {
/* We want to add because we couldn't find in lower */
- d_add(ecryptfs_dentry, NULL);
- goto out;
- }
- rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
- ecryptfs_dir_inode->i_sb,
- ECRYPTFS_INTERPOSE_FLAG_D_ADD);
- if (rc) {
- printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
- if (S_ISDIR(lower_inode->i_mode))
- goto out;
- if (S_ISLNK(lower_inode->i_mode))
- goto out;
- if (special_file(lower_inode->i_mode))
- goto out;
- /* Released in this function */
- page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
- if (!page_virt) {
- printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
- __func__);
- rc = -ENOMEM;
- goto out;
+ d_add(dentry, NULL);
+ return 0;
}
- rc = ecryptfs_get_lower_file(ecryptfs_dentry);
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to initialize "
- "the lower file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
- goto out_free_kmem;
+ inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb);
+ if (IS_ERR(inode)) {
+ printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n",
+ __func__, PTR_ERR(inode));
+ return PTR_ERR(inode);
}
- put_lower = 1;
- crypt_stat = &ecryptfs_inode_to_private(
- ecryptfs_dentry->d_inode)->crypt_stat;
- /* TODO: lock for crypt_stat comparison */
- if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
- ecryptfs_set_default_sizes(crypt_stat);
- rc = ecryptfs_read_and_validate_header_region(page_virt,
- ecryptfs_dentry->d_inode);
- if (rc) {
- memset(page_virt, 0, PAGE_CACHE_SIZE);
- rc = ecryptfs_read_and_validate_xattr_region(page_virt,
- ecryptfs_dentry);
+ if (S_ISREG(inode->i_mode)) {
+ rc = ecryptfs_i_size_read(dentry, inode);
if (rc) {
- rc = 0;
- goto out_free_kmem;
+ make_bad_inode(inode);
+ return rc;
}
- crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
}
- ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
-out_free_kmem:
- kmem_cache_free(ecryptfs_header_cache_2, page_virt);
- goto out;
-out_put:
- dput(lower_dentry);
- mntput(lower_mnt);
- d_drop(ecryptfs_dentry);
-out:
- if (put_lower)
- ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
+
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+ d_add(dentry, inode);
+
return rc;
}
@@ -353,12 +433,12 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
goto out_d_drop;
}
if (lower_dentry->d_inode)
- goto lookup_and_interpose;
+ goto interpose;
mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
if (!(mount_crypt_stat
&& (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
- goto lookup_and_interpose;
+ goto interpose;
dput(lower_dentry);
rc = ecryptfs_encrypt_and_encode_filename(
&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
@@ -381,9 +461,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
encrypted_and_encoded_name);
goto out_d_drop;
}
-lookup_and_interpose:
- rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
- ecryptfs_dir_inode);
+interpose:
+ rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
+ ecryptfs_dir_inode);
goto out;
out_d_drop:
d_drop(ecryptfs_dentry);
@@ -411,7 +491,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
lower_new_dentry);
if (rc || !lower_new_dentry->d_inode)
goto out_lock;
- rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
+ rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
if (rc)
goto out_lock;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -478,7 +558,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
kfree(encoded_symname);
if (rc || !lower_dentry->d_inode)
goto out_lock;
- rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out_lock;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -502,7 +582,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
if (rc || !lower_dentry->d_inode)
goto out;
- rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -521,8 +601,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
struct dentry *lower_dir_dentry;
int rc;
- dentry_unhash(dentry);
-
lower_dentry = ecryptfs_dentry_to_lower(dentry);
dget(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
@@ -552,7 +630,7 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
if (rc || !lower_dentry->d_inode)
goto out;
- rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out;
fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -575,9 +653,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *lower_new_dir_dentry;
struct dentry *trap = NULL;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
dget(lower_old_dentry);
@@ -755,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
lower_ia->ia_valid &= ~ATTR_SIZE;
return 0;
}
- rc = ecryptfs_get_lower_file(dentry);
+ rc = ecryptfs_get_lower_file(dentry, inode);
if (rc)
return rc;
crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
@@ -911,7 +986,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
mount_crypt_stat = &ecryptfs_superblock_to_private(
dentry->d_sb)->mount_crypt_stat;
- rc = ecryptfs_get_lower_file(dentry);
+ rc = ecryptfs_get_lower_file(dentry, inode);
if (rc) {
mutex_unlock(&crypt_stat->cs_mutex);
goto out;
@@ -1084,21 +1159,6 @@ out:
return rc;
}
-int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
-{
- if ((ecryptfs_inode_to_lower(inode)
- == (struct inode *)candidate_lower_inode))
- return 1;
- else
- return 0;
-}
-
-int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
-{
- ecryptfs_init_inode(inode, (struct inode *)lower_inode);
- return 0;
-}
-
const struct inode_operations ecryptfs_symlink_iops = {
.readlink = ecryptfs_readlink,
.follow_link = ecryptfs_follow_link,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 89b93389af8e..9f1bb747d77d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -135,12 +135,12 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
return rc;
}
-int ecryptfs_get_lower_file(struct dentry *dentry)
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode)
{
- struct ecryptfs_inode_info *inode_info =
- ecryptfs_inode_to_private(dentry->d_inode);
+ struct ecryptfs_inode_info *inode_info;
int count, rc = 0;
+ inode_info = ecryptfs_inode_to_private(inode);
mutex_lock(&inode_info->lower_file_mutex);
count = atomic_inc_return(&inode_info->lower_file_count);
if (WARN_ON_ONCE(count < 1))
@@ -168,75 +168,6 @@ void ecryptfs_put_lower_file(struct inode *inode)
}
}
-static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
- struct super_block *sb)
-{
- struct inode *inode;
- int rc = 0;
-
- if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
- rc = -EXDEV;
- goto out;
- }
- if (!igrab(lower_inode)) {
- rc = -ESTALE;
- goto out;
- }
- inode = iget5_locked(sb, (unsigned long)lower_inode,
- ecryptfs_inode_test, ecryptfs_inode_set,
- lower_inode);
- if (!inode) {
- rc = -EACCES;
- iput(lower_inode);
- goto out;
- }
- if (inode->i_state & I_NEW)
- unlock_new_inode(inode);
- else
- iput(lower_inode);
- if (S_ISLNK(lower_inode->i_mode))
- inode->i_op = &ecryptfs_symlink_iops;
- else if (S_ISDIR(lower_inode->i_mode))
- inode->i_op = &ecryptfs_dir_iops;
- if (S_ISDIR(lower_inode->i_mode))
- inode->i_fop = &ecryptfs_dir_fops;
- if (special_file(lower_inode->i_mode))
- init_special_inode(inode, lower_inode->i_mode,
- lower_inode->i_rdev);
- fsstack_copy_attr_all(inode, lower_inode);
- /* This size will be overwritten for real files w/ headers and
- * other metadata */
- fsstack_copy_inode_size(inode, lower_inode);
- return inode;
-out:
- return ERR_PTR(rc);
-}
-
-/**
- * ecryptfs_interpose
- * @lower_dentry: Existing dentry in the lower filesystem
- * @dentry: ecryptfs' dentry
- * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
- *
- * Interposes upper and lower dentries.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
- struct super_block *sb, u32 flags)
-{
- struct inode *lower_inode = lower_dentry->d_inode;
- struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
- d_add(dentry, inode);
- else
- d_instantiate(dentry, inode);
- return 0;
-}
-
enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
ecryptfs_opt_ecryptfs_key_bytes,
@@ -704,13 +635,8 @@ static struct ecryptfs_cache_info {
.size = sizeof(struct ecryptfs_sb_info),
},
{
- .cache = &ecryptfs_header_cache_1,
- .name = "ecryptfs_headers_1",
- .size = PAGE_CACHE_SIZE,
- },
- {
- .cache = &ecryptfs_header_cache_2,
- .name = "ecryptfs_headers_2",
+ .cache = &ecryptfs_header_cache,
+ .name = "ecryptfs_headers",
.size = PAGE_CACHE_SIZE,
},
{
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 245b517bf1b6..dbd52d40df4c 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -93,22 +93,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
}
/**
- * ecryptfs_init_inode
- * @inode: The ecryptfs inode
- *
- * Set up the ecryptfs inode.
- */
-void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
-{
- ecryptfs_set_inode_lower(inode, lower_inode);
- inode->i_ino = lower_inode->i_ino;
- inode->i_version++;
- inode->i_op = &ecryptfs_main_iops;
- inode->i_fop = &ecryptfs_main_fops;
- inode->i_mapping->a_ops = &ecryptfs_aops;
-}
-
-/**
* ecryptfs_statfs
* @sb: The ecryptfs super block
* @buf: The struct kstatfs to fill in with stats
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 68b2e43d7c35..3451d23c3bae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3392,7 +3392,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext3_dirty_inode(struct inode *inode)
+void ext3_dirty_inode(struct inode *inode, int flags)
{
handle_t *current_handle = ext3_journal_current_handle();
handle_t *handle;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a74b89c09f90..1921392cd708 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1813,7 +1813,7 @@ extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int ext4_sync_inode(handle_t *, struct inode *);
-extern void ext4_dirty_inode(struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 50d0e9c64584..a5763e3505ba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5733,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*/
-void ext4_dirty_inode(struct inode *inode)
+void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index be15437c272e..3b222dafd15b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
struct fat_slot_info sinfo;
int err;
- dentry_unhash(dentry);
-
lock_super(sb);
/*
* Check whether the directory is not in use, then check
@@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
err = fat_scan(old_dir, old_name, &old_sinfo);
if (err) {
err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c61a6789f36c..20b4ea53fdc4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
struct fat_slot_info sinfo;
int err;
- dentry_unhash(dentry);
-
lock_super(sb);
err = fat_dir_empty(inode);
@@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
int err, is_dir, update_dotdot, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34591ee804b5..0f015a0468de 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1007,9 +1007,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* In short, make sure you hash any inodes _before_ you start marking
* them dirty.
*
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
* Note that for blockdevs, inode->dirtied_when represents the dirtying time of
* the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
* the kernel-internal blockdev inode represents the dirtying time of the
@@ -1028,7 +1025,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode);
+ sb->s_op->dirty_inode(inode, flags);
}
/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0d0e3faddcfa..d50160714595 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
if (IS_ERR(req))
return PTR_ERR(req);
- dentry_unhash(entry);
-
req->in.h.opcode = FUSE_RMDIR;
req->in.h.nodeid = get_node_id(dir);
req->in.numargs = 1;
@@ -694,9 +692,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
struct fuse_conn *fc = get_fuse_conn(olddir);
struct fuse_req *req = fuse_get_req(fc);
- if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
- dentry_unhash(newent);
-
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 1cb70cdba2c1..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int res;
- if (S_ISDIR(inode->i_mode))
- dentry_unhash(dentry);
-
if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
return -ENOTEMPTY;
res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
- if (S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
res = hfs_remove(new_dir, new_dentry);
if (res)
return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index b28835091dd0..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int res;
- dentry_unhash(dentry);
-
if (inode->i_size != 2)
return -ENOTEMPTY;
@@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
- if (S_ISDIR(new_dentry->d_inode->i_mode)) {
- dentry_unhash(new_dentry);
+ if (S_ISDIR(new_dentry->d_inode->i_mode))
res = hfsplus_rmdir(new_dir, new_dentry);
- } else {
+ else
res = hfsplus_unlink(new_dir, new_dentry);
- }
if (res)
return res;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6816b9e6903..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
char *file;
int err;
- dentry_unhash(dentry);
-
if ((file = dentry_name(dentry)) == NULL)
return -ENOMEM;
err = do_rmdir(file);
@@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
char *from_name, *to_name;
int err;
- if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
- dentry_unhash(to);
-
if ((from_name = dentry_name(from)) == NULL)
return -ENOMEM;
if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ff0ce21c0867..acf95dab2aac 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -439,8 +439,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
int err;
int r;
- dentry_unhash(dentry);
-
hpfs_adjust_length(name, &len);
hpfs_lock(dir->i_sb);
err = -ENOENT;
@@ -535,9 +533,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct fnode *fnode;
int err;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
if ((err = hpfs_chk_name(new_name, &new_len))) return err;
err = 0;
hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/inode.c b/fs/inode.c
index 990d284877a1..0f7e88a7803f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1,9 +1,7 @@
/*
- * linux/fs/inode.c
- *
* (C) 1997 Linus Torvalds
+ * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
*/
-
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/dcache.h>
@@ -27,10 +25,11 @@
#include <linux/prefetch.h>
#include <linux/ima.h>
#include <linux/cred.h>
+#include <linux/buffer_head.h> /* for inode_has_buffers */
#include "internal.h"
/*
- * inode locking rules.
+ * Inode locking rules:
*
* inode->i_lock protects:
* inode->i_state, inode->i_hash, __iget()
@@ -60,54 +59,11 @@
* inode_hash_lock
*/
-/*
- * This is needed for the following functions:
- * - inode_has_buffers
- * - invalidate_bdev
- *
- * FIXME: remove all knowledge of the buffer layer from this file
- */
-#include <linux/buffer_head.h>
-
-/*
- * New inode.c implementation.
- *
- * This implementation has the basic premise of trying
- * to be extremely low-overhead and SMP-safe, yet be
- * simple enough to be "obviously correct".
- *
- * Famous last words.
- */
-
-/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
-
-/* #define INODE_PARANOIA 1 */
-/* #define INODE_DEBUG 1 */
-
-/*
- * Inode lookup is no longer as critical as it used to be:
- * most of the lookups are going to be through the dcache.
- */
-#define I_HASHBITS i_hash_shift
-#define I_HASHMASK i_hash_mask
-
static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
-/*
- * Each inode can be on two separate lists. One is
- * the hash list of the inode, used for lookups. The
- * other linked list is the "type" list:
- * "in_use" - valid inode, i_count > 0, i_nlink > 0
- * "dirty" - as "in_use" but also dirty
- * "unused" - valid inode, i_count = 0
- *
- * A "dirty" list is maintained for each super block,
- * allowing for low-overhead inode sync() operations.
- */
-
static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);
@@ -424,8 +380,8 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
L1_CACHE_BYTES;
- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
- return tmp & I_HASHMASK;
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
+ return tmp & i_hash_mask;
}
/**
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9a1e86fc1362..4bca6a2e5c07 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -605,8 +605,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
int ret;
uint32_t now = get_seconds();
- dentry_unhash(dentry);
-
for (fd = f->dents ; fd; fd = fd->next) {
if (fd->ino)
return -ENOTEMPTY;
@@ -782,9 +780,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
uint8_t type;
uint32_t now;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
/* The VFS will check for us and prevent trying to rename a
* file over a directory and vice versa, but if it's a directory,
* the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index e896e67767eb..46ad619b6124 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -357,7 +357,7 @@ error:
return ERR_PTR(ret);
}
-void jffs2_dirty_inode(struct inode *inode)
+void jffs2_dirty_inode(struct inode *inode, int flags)
{
struct iattr iattr;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 00bae7cc2e48..65c6c43ca482 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -172,7 +172,7 @@ int jffs2_setattr (struct dentry *, struct iattr *);
int jffs2_do_setattr (struct inode *, struct iattr *);
struct inode *jffs2_iget(struct super_block *, unsigned long);
void jffs2_evict_inode (struct inode *);
-void jffs2_dirty_inode(struct inode *inode);
+void jffs2_dirty_inode(struct inode *inode, int flags);
struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
struct jffs2_raw_inode *ri);
int jffs2_statfs (struct dentry *, struct kstatfs *);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index eddbb373209e..109655904bbc 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -173,7 +173,7 @@ void jfs_evict_inode(struct inode *inode)
dquot_drop(inode);
}
-void jfs_dirty_inode(struct inode *inode)
+void jfs_dirty_inode(struct inode *inode, int flags)
{
static int noisy = 5;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 155e91eff07d..ec2fb8b945fc 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -28,7 +28,7 @@ extern struct inode *jfs_iget(struct super_block *, unsigned long);
extern int jfs_commit_inode(struct inode *, int);
extern int jfs_write_inode(struct inode *, struct writeback_control *);
extern void jfs_evict_inode(struct inode *);
-extern void jfs_dirty_inode(struct inode *);
+extern void jfs_dirty_inode(struct inode *, int);
extern void jfs_truncate(struct inode *);
extern void jfs_truncate_nolock(struct inode *, loff_t);
extern void jfs_free_zero_link(struct inode *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 865df16a6cf3..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
- dentry_unhash(dentry);
-
/* Init inode for quota operations. */
dquot_initialize(dip);
dquot_initialize(ip);
@@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
new_dentry->d_name.name);
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
dquot_initialize(old_dir);
dquot_initialize(new_dir);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f34c9cde9e94..9ed89d1663f8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- dentry_unhash(dentry);
-
if (!logfs_empty_dir(inode))
return -ENOTEMPTY;
@@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
loff_t pos;
int err;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
/* 1. locate source dd */
err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
if (err)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f60aed8db9c4..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
struct inode * inode = dentry->d_inode;
int err = -ENOTEMPTY;
- dentry_unhash(dentry);
-
if (minix_empty_dir(inode)) {
err = minix_unlink(dir, dentry);
if (!err) {
@@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
struct minix_dir_entry * old_de;
int err = -ENOENT;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_de = minix_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
diff --git a/fs/namei.c b/fs/namei.c
index 2358b326b221..e2e4e8d032ee 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -919,12 +919,11 @@ static inline bool managed_dentry_might_block(struct dentry *dentry)
}
/*
- * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
- * meet a managed dentry and we're not walking to "..". True is returned to
- * continue, false to abort.
+ * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
+ * we meet a managed dentry that would need blocking.
*/
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
- struct inode **inode, bool reverse_transit)
+ struct inode **inode)
{
for (;;) {
struct vfsmount *mounted;
@@ -933,8 +932,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
* that wants to block transit.
*/
*inode = path->dentry->d_inode;
- if (!reverse_transit &&
- unlikely(managed_dentry_might_block(path->dentry)))
+ if (unlikely(managed_dentry_might_block(path->dentry)))
return false;
if (!d_mountpoint(path->dentry))
@@ -947,16 +945,24 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
path->dentry = mounted->mnt_root;
nd->seq = read_seqcount_begin(&path->dentry->d_seq);
}
-
- if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
- return reverse_transit;
return true;
}
-static int follow_dotdot_rcu(struct nameidata *nd)
+static void follow_mount_rcu(struct nameidata *nd)
{
- struct inode *inode = nd->inode;
+ while (d_mountpoint(nd->path.dentry)) {
+ struct vfsmount *mounted;
+ mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
+ if (!mounted)
+ break;
+ nd->path.mnt = mounted;
+ nd->path.dentry = mounted->mnt_root;
+ nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+ }
+}
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
set_root_rcu(nd);
while (1) {
@@ -972,7 +978,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
seq = read_seqcount_begin(&parent->d_seq);
if (read_seqcount_retry(&old->d_seq, nd->seq))
goto failed;
- inode = parent->d_inode;
nd->path.dentry = parent;
nd->seq = seq;
break;
@@ -980,10 +985,9 @@ static int follow_dotdot_rcu(struct nameidata *nd)
if (!follow_up_rcu(&nd->path))
break;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
- inode = nd->path.dentry->d_inode;
}
- __follow_mount_rcu(nd, &nd->path, &inode, true);
- nd->inode = inode;
+ follow_mount_rcu(nd);
+ nd->inode = nd->path.dentry->d_inode;
return 0;
failed:
@@ -1157,8 +1161,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
}
path->mnt = mnt;
path->dentry = dentry;
- if (likely(__follow_mount_rcu(nd, path, inode, false)))
- return 0;
+ if (unlikely(!__follow_mount_rcu(nd, path, inode)))
+ goto unlazy;
+ if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+ goto unlazy;
+ return 0;
unlazy:
if (unlazy_walk(nd, dentry))
return -ECHILD;
@@ -2572,6 +2579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
if (error)
goto out;
+ shrink_dcache_parent(dentry);
error = dir->i_op->rmdir(dir, dentry);
if (error)
goto out;
@@ -2986,6 +2994,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
goto out;
+ if (target)
+ shrink_dcache_parent(new_dentry);
error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
if (error)
goto out;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e3e646b06404..9c51f621e901 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,8 +1033,11 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
DPRINTK("ncp_rmdir: removing %s/%s\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
+ /*
+ * fail with EBUSY if there are still references to this
+ * directory.
+ */
dentry_unhash(dentry);
-
error = -EBUSY;
if (!d_unhashed(dentry))
goto out;
@@ -1141,8 +1144,16 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
+ /*
+ * fail with EBUSY if there are still references to this
+ * directory.
+ */
dentry_unhash(new_dentry);
+ error = -EBUSY;
+ if (!d_unhashed(new_dentry))
+ goto out;
+ }
ncp_age_dentry(server, old_dentry);
ncp_age_dentry(server, new_dentry);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index ba306658a6db..81515545ba75 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -87,6 +87,16 @@ config NFS_V4_1
config PNFS_FILE_LAYOUT
tristate
+config PNFS_OBJLAYOUT
+ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+ help
+ Say M here if you want your pNFS client to support the Objects Layout Driver.
+ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
+ upper level driver (SCSI_OSD_ULD).
+
+ If unsure, say N.
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4776ff9e3814..6a34f7dd0e6f 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
delegation.o idmap.o \
callback.o callback_xdr.o callback_proc.o \
nfs4namespace.o
-nfs-$(CONFIG_NFS_V4_1) += pnfs.o
+nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 46d93ce7311b..b257383bb565 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
extern void nfs4_cb_take_slot(struct nfs_client *clp);
+
+struct cb_devicenotifyitem {
+ uint32_t cbd_notify_type;
+ uint32_t cbd_layout_type;
+ struct nfs4_deviceid cbd_dev_id;
+ uint32_t cbd_immediate;
+};
+
+struct cb_devicenotifyargs {
+ int ndevs;
+ struct cb_devicenotifyitem *devs;
+};
+
+extern __be32 nfs4_callback_devicenotify(
+ struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps);
+
#endif /* CONFIG_NFS_V4_1 */
extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2f41dccea18e..d4d1954e9bb9 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
mark_matching_lsegs_invalid(lo, &free_me_list,
- args->cbl_range.iomode))
+ &args->cbl_range))
rv = NFS4ERR_DELAY;
else
rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+ if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
rv = NFS4ERR_DELAY;
list_del_init(&lo->plh_bulk_recall);
spin_unlock(&ino->i_lock);
@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
do_callback_layoutrecall(clp, &args);
}
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps)
+{
+ int i;
+ __be32 res = 0;
+ struct nfs_client *clp = cps->clp;
+ struct nfs_server *server = NULL;
+
+ dprintk("%s: -->\n", __func__);
+
+ if (!clp) {
+ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+ goto out;
+ }
+
+ for (i = 0; i < args->ndevs; i++) {
+ struct cb_devicenotifyitem *dev = &args->devs[i];
+
+ if (!server ||
+ server->pnfs_curr_ld->id != dev->cbd_layout_type) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ if (server->pnfs_curr_ld &&
+ server->pnfs_curr_ld->id == dev->cbd_layout_type) {
+ rcu_read_unlock();
+ goto found;
+ }
+ rcu_read_unlock();
+ dprintk("%s: layout type %u not found\n",
+ __func__, dev->cbd_layout_type);
+ continue;
+ }
+
+ found:
+ if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
+ dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
+ "deleting instead\n", __func__);
+ nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+ }
+
+out:
+ kfree(args->devs);
+ dprintk("%s: exit with status = %u\n",
+ __func__, be32_to_cpu(res));
+ return res;
+}
+
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{
if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 00ecf62ce7c1..c6c86a77e043 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,6 +25,7 @@
#if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
4 + 1 + 3)
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
return status;
}
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct cb_devicenotifyargs *args)
+{
+ __be32 *p;
+ __be32 status = 0;
+ u32 tmp;
+ int n, i;
+ args->ndevs = 0;
+
+ /* Num of device notifications */
+ p = read_buf(xdr, sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ n = ntohl(*p++);
+ if (n <= 0)
+ goto out;
+
+ args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
+ if (!args->devs) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out;
+ }
+
+ /* Decode each dev notification */
+ for (i = 0; i < n; i++) {
+ struct cb_devicenotifyitem *dev = &args->devs[i];
+
+ p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto err;
+ }
+
+ tmp = ntohl(*p++); /* bitmap size */
+ if (tmp != 1) {
+ status = htonl(NFS4ERR_INVAL);
+ goto err;
+ }
+ dev->cbd_notify_type = ntohl(*p++);
+ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+ status = htonl(NFS4ERR_INVAL);
+ goto err;
+ }
+
+ tmp = ntohl(*p++); /* opaque size */
+ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+ (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+ (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+ status = htonl(NFS4ERR_INVAL);
+ goto err;
+ }
+ dev->cbd_layout_type = ntohl(*p++);
+ memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
+ p = read_buf(xdr, sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto err;
+ }
+ dev->cbd_immediate = ntohl(*p++);
+ } else {
+ dev->cbd_immediate = 0;
+ }
+
+ args->ndevs++;
+
+ dprintk("%s: type %d layout 0x%x immediate %d\n",
+ __func__, dev->cbd_notify_type, dev->cbd_layout_type,
+ dev->cbd_immediate);
+ }
+out:
+ dprintk("%s: status %d ndevs %d\n",
+ __func__, ntohl(status), args->ndevs);
+ return status;
+err:
+ kfree(args->devs);
+ goto out;
+}
+
static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_RECALL_ANY:
case OP_CB_RECALL_SLOT:
case OP_CB_LAYOUTRECALL:
+ case OP_CB_NOTIFY_DEVICEID:
*op = &callback_ops[op_nr];
break;
- case OP_CB_NOTIFY_DEVICEID:
case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG:
case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
(callback_decode_arg_t)decode_layoutrecall_args,
.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
},
+ [OP_CB_NOTIFY_DEVICEID] = {
+ .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+ .decode_args =
+ (callback_decode_arg_t)decode_devicenotify_args,
+ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+ },
[OP_CB_SEQUENCE] = {
.process_op = (callback_process_op_t)nfs4_callback_sequence,
.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 139be9647d80..b3dc2b88b65b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
if (clp->cl_machine_cred != NULL)
put_rpccred(clp->cl_machine_cred);
+ nfs4_deviceid_purge_client(clp);
+
kfree(clp->cl_hostname);
kfree(clp);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index bbbc6bf5cb2e..dd25c2aec375 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -21,25 +21,13 @@
#include "delegation.h"
#include "internal.h"
-static void nfs_do_free_delegation(struct nfs_delegation *delegation)
-{
- kfree(delegation);
-}
-
-static void nfs_free_delegation_callback(struct rcu_head *head)
-{
- struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu);
-
- nfs_do_free_delegation(delegation);
-}
-
static void nfs_free_delegation(struct nfs_delegation *delegation)
{
if (delegation->cred) {
put_rpccred(delegation->cred);
delegation->cred = NULL;
}
- call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+ kfree_rcu(delegation, rcu);
}
/**
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 424e47773a84..ededdbd0db38 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
struct page **xdr_pages, struct page *page, unsigned int buflen)
{
struct xdr_stream stream;
- struct xdr_buf buf = {
- .pages = xdr_pages,
- .page_len = buflen,
- .buflen = buflen,
- .len = buflen,
- };
+ struct xdr_buf buf;
struct page *scratch;
struct nfs_cache_array *array;
unsigned int count = 0;
@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
if (scratch == NULL)
return -ENOMEM;
- xdr_init_decode(&stream, &buf, NULL);
+ xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
do {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 57bb31ad7a5e..144f2a3c7185 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1298,8 +1298,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
- dprintk("NFS: isize change on server for file %s/%ld\n",
- inode->i_sb->s_id, inode->i_ino);
+ dprintk("NFS: isize change on server for file %s/%ld "
+ "(%Ld to %Ld)\n",
+ inode->i_sb->s_id,
+ inode->i_ino,
+ (long long)cur_isize,
+ (long long)new_isize);
}
} else
invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
@@ -1424,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
void nfs4_evict_inode(struct inode *inode)
{
- pnfs_destroy_layout(NFS_I(inode));
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
+ pnfs_return_layout(inode);
+ pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
/* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2df6ca7b5898..b9056cbe68d6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
#endif
/* nfs4proc.c */
+extern void __nfs4_read_done_cb(struct nfs_read_data *);
extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
extern int nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be79dc9f386d..426908809c97 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
struct nfs4_deviceid *id,
gfp_t gfp_flags)
{
+ struct nfs4_deviceid_node *d;
struct nfs4_file_layout_dsaddr *dsaddr;
int status = -EINVAL;
struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
if (fl->pattern_offset > lgr->range.offset) {
- dprintk("%s pattern_offset %lld to large\n",
+ dprintk("%s pattern_offset %lld too large\n",
__func__, fl->pattern_offset);
goto out;
}
@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
}
/* find and reference the deviceid */
- dsaddr = nfs4_fl_find_get_deviceid(id);
- if (dsaddr == NULL) {
+ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
+ NFS_SERVER(lo->plh_inode)->nfs_client, id);
+ if (d == NULL) {
dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
if (dsaddr == NULL)
goto out;
- }
+ } else
+ dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
fl->dsaddr = dsaddr;
if (fl->first_stripe_index < 0 ||
@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
gfp_t gfp_flags)
{
struct xdr_stream stream;
- struct xdr_buf buf = {
- .pages = lgr->layoutp->pages,
- .page_len = lgr->layoutp->len,
- .buflen = lgr->layoutp->len,
- .len = lgr->layoutp->len,
- };
+ struct xdr_buf buf;
struct page *scratch;
__be32 *p;
uint32_t nfl_util;
@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
if (!scratch)
return -ENOMEM;
- xdr_init_decode(&stream, &buf, NULL);
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
memcpy(id, p, sizeof(*id));
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
- print_deviceid(id);
+ nfs4_print_deviceid(id);
nfl_util = be32_to_cpup(p++);
if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
/*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests()
*
- * return 1 : coalesce page
- * return 0 : don't coalesce page
+ * return true : coalesce page
+ * return false : don't coalesce page
*/
-int
+bool
filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
u64 p_stripe, r_stripe;
u32 stripe_unit;
+ if (!pnfs_generic_pg_test(pgio, prev, req))
+ return 0;
+
if (!pgio->pg_lseg)
return 1;
p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
return -ENOMEM;
}
+static void
+filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+{
+ nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
+}
+
static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
+ .free_deviceid_node = filelayout_free_deveiceid_node,
};
static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2b461d77b43a..cebe01e3795e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds {
#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
struct nfs4_file_layout_dsaddr {
- struct hlist_node node;
- struct nfs4_deviceid deviceid;
- atomic_t ref;
+ struct nfs4_deviceid_node id_node;
unsigned long flags;
u32 stripe_count;
u8 *stripe_indices;
@@ -95,14 +93,12 @@ extern struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
extern void print_ds(struct nfs4_pnfs_ds *ds);
-extern void print_deviceid(struct nfs4_deviceid *dev_id);
u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
u32 ds_idx);
-extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index db07c7af1395..3b7bf1377264 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,30 +37,6 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_FL_DEVICE_ID_HASH_BITS 5
-#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
-#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
-
-static inline u32
-nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
-{
- unsigned char *cptr = (unsigned char *)id->data;
- unsigned int nbytes = NFS4_DEVICEID4_SIZE;
- u32 x = 0;
-
- while (nbytes--) {
- x *= 37;
- x += *cptr++;
- }
- return x & NFS4_FL_DEVICE_ID_HASH_MASK;
-}
-
-static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
-static DEFINE_SPINLOCK(filelayout_deviceid_lock);
-
-/*
* Data server cache
*
* Data servers can be mapped to different device ids.
@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
}
-void
-print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
-{
- int i;
-
- ifdebug(FACILITY) {
- printk("%s dsaddr->ds_num %d\n", __func__,
- dsaddr->ds_num);
- for (i = 0; i < dsaddr->ds_num; i++)
- print_ds(dsaddr->ds_list[i]);
- }
-}
-
-void print_deviceid(struct nfs4_deviceid *id)
-{
- u32 *p = (u32 *)id;
-
- dprintk("%s: device id= [%x%x%x%x]\n", __func__,
- p[0], p[1], p[2], p[3]);
-}
-
/* nfs4_ds_cache_lock is held */
static struct nfs4_pnfs_ds *
_data_server_lookup_locked(u32 ip_addr, u32 port)
@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
kfree(ds);
}
-static void
+void
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
struct nfs4_pnfs_ds *ds;
int i;
- print_deviceid(&dsaddr->deviceid);
+ nfs4_print_deviceid(&dsaddr->id_node.deviceid);
for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i];
@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
u8 max_stripe_index;
struct nfs4_file_layout_dsaddr *dsaddr = NULL;
struct xdr_stream stream;
- struct xdr_buf buf = {
- .pages = pdev->pages,
- .page_len = pdev->pglen,
- .buflen = pdev->pglen,
- .len = pdev->pglen,
- };
+ struct xdr_buf buf;
struct page *scratch;
/* set up xdr stream */
@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
if (!scratch)
goto out_err;
- xdr_init_decode(&stream, &buf, NULL);
+ xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
/* Get the stripe count (number of stripe index) */
@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
dsaddr->stripe_indices = stripe_indices;
stripe_indices = NULL;
dsaddr->ds_num = num;
-
- memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
+ nfs4_init_deviceid_node(&dsaddr->id_node,
+ NFS_SERVER(ino)->pnfs_curr_ld,
+ NFS_SERVER(ino)->nfs_client,
+ &pdev->dev_id);
for (i = 0; i < dsaddr->ds_num; i++) {
int j;
@@ -505,8 +457,8 @@ out_err:
static struct nfs4_file_layout_dsaddr *
decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
{
- struct nfs4_file_layout_dsaddr *d, *new;
- long hash;
+ struct nfs4_deviceid_node *d;
+ struct nfs4_file_layout_dsaddr *n, *new;
new = decode_device(inode, dev, gfp_flags);
if (!new) {
@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
return NULL;
}
- spin_lock(&filelayout_deviceid_lock);
- d = nfs4_fl_find_get_deviceid(&new->deviceid);
- if (d) {
- spin_unlock(&filelayout_deviceid_lock);
+ d = nfs4_insert_deviceid_node(&new->id_node);
+ n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+ if (n != new) {
nfs4_fl_free_deviceid(new);
- return d;
+ return n;
}
- INIT_HLIST_NODE(&new->node);
- atomic_set(&new->ref, 1);
- hash = nfs4_fl_deviceid_hash(&new->deviceid);
- hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
- spin_unlock(&filelayout_deviceid_lock);
-
return new;
}
@@ -600,35 +545,7 @@ out_free:
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
- if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
- hlist_del_rcu(&dsaddr->node);
- spin_unlock(&filelayout_deviceid_lock);
-
- synchronize_rcu();
- nfs4_fl_free_deviceid(dsaddr);
- }
-}
-
-struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
-{
- struct nfs4_file_layout_dsaddr *d;
- struct hlist_node *n;
- long hash = nfs4_fl_deviceid_hash(id);
-
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
- if (!memcmp(&d->deviceid, id, sizeof(*id))) {
- if (!atomic_inc_not_zero(&d->ref))
- goto fail;
- rcu_read_unlock();
- return d;
- }
- }
-fail:
- rcu_read_unlock();
- return NULL;
+ nfs4_put_deviceid_node(&dsaddr->id_node);
}
/*
@@ -676,15 +593,15 @@ static void
filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
int err, u32 ds_addr)
{
- u32 *p = (u32 *)&dsaddr->deviceid;
+ u32 *p = (u32 *)&dsaddr->id_node.deviceid;
printk(KERN_ERR "NFS: data server %x connection error %d."
" Deviceid [%x%x%x%x] marked out of use.\n",
ds_addr, err, p[0], p[1], p[2], p[3]);
- spin_lock(&filelayout_deviceid_lock);
+ spin_lock(&nfs4_ds_cache_lock);
dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
- spin_unlock(&filelayout_deviceid_lock);
+ spin_unlock(&nfs4_ds_cache_lock);
}
struct nfs4_pnfs_ds *
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf1b339c3937..d2c4b59c896d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -267,9 +267,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
break;
nfs4_schedule_stateid_recovery(server, state);
goto wait_on_recovery;
+ case -NFS4ERR_EXPIRED:
+ if (state != NULL)
+ nfs4_schedule_stateid_recovery(server, state);
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_EXPIRED:
nfs4_schedule_lease_recovery(clp);
goto wait_on_recovery;
#if defined(CONFIG_NFS_V4_1)
@@ -2361,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs4_state *state = NULL;
int status;
+ if (pnfs_ld_layoutret_on_setattr(inode))
+ pnfs_return_layout(inode);
+
nfs_fattr_init(fattr);
/* Search for an existing open(O_WRITE) file */
@@ -3175,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return err;
}
+void __nfs4_read_done_cb(struct nfs_read_data *data)
+{
+ nfs_invalidate_atime(data->inode);
+}
+
static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
{
struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3184,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
return -EAGAIN;
}
- nfs_invalidate_atime(data->inode);
+ __nfs4_read_done_cb(data);
if (task->tk_status > 0)
renew_lease(server, data->timestamp);
return 0;
@@ -3198,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
- return data->read_done_cb(task, data);
+ return data->read_done_cb ? data->read_done_cb(task, data) :
+ nfs4_read_done_cb(task, data);
}
static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
@@ -3243,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
- return data->write_done_cb(task, data);
+ return data->write_done_cb ? data->write_done_cb(task, data) :
+ nfs4_write_done_cb(task, data);
}
/* Reset the the nfs_write_data to send the write to the MDS. */
@@ -3670,9 +3682,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
break;
nfs4_schedule_stateid_recovery(server, state);
goto wait_on_recovery;
+ case -NFS4ERR_EXPIRED:
+ if (state != NULL)
+ nfs4_schedule_stateid_recovery(server, state);
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_EXPIRED:
nfs4_schedule_lease_recovery(clp);
goto wait_on_recovery;
#if defined(CONFIG_NFS_V4_1)
@@ -4543,6 +4557,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
case -ESTALE:
goto out;
case -NFS4ERR_EXPIRED:
+ nfs4_schedule_stateid_recovery(server, state);
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
nfs4_schedule_lease_recovery(server->nfs_client);
@@ -5666,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
return status;
}
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ dprintk("--> %s\n", __func__);
+ if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
+ &lrp->res.seq_res, 0, task))
+ return;
+ rpc_call_start(task);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+ struct nfs_server *server;
+
+ dprintk("--> %s\n", __func__);
+
+ if (!nfs4_sequence_done(task, &lrp->res.seq_res))
+ return;
+
+ server = NFS_SERVER(lrp->args.inode);
+ if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ nfs_restart_rpc(task, lrp->clp);
+ return;
+ }
+ if (task->tk_status == 0) {
+ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+
+ if (lrp->res.lrs_present) {
+ spin_lock(&lo->plh_inode->i_lock);
+ pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+ spin_unlock(&lo->plh_inode->i_lock);
+ } else
+ BUG_ON(!list_empty(&lo->plh_segs));
+ }
+ dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ dprintk("--> %s\n", __func__);
+ put_layout_hdr(NFS_I(lrp->args.inode)->layout);
+ kfree(calldata);
+ dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+ .rpc_call_prepare = nfs4_layoutreturn_prepare,
+ .rpc_call_done = nfs4_layoutreturn_done,
+ .rpc_release = nfs4_layoutreturn_release,
+};
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+ .rpc_argp = &lrp->args,
+ .rpc_resp = &lrp->res,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = lrp->clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_layoutreturn_call_ops,
+ .callback_data = lrp,
+ };
+ int status;
+
+ dprintk("--> %s\n", __func__);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = task->tk_status;
+ dprintk("<-- %s status=%d\n", __func__, status);
+ rpc_put_task(task);
+ return status;
+}
+
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
{
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 036f5adc9e1f..e97dd219f84f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1466,7 +1466,10 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
#ifdef CONFIG_NFS_V4_1
void nfs4_schedule_session_recovery(struct nfs4_session *session)
{
- nfs4_schedule_lease_recovery(session->clp);
+ struct nfs_client *clp = session->clp;
+
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ nfs4_schedule_lease_recovery(clp);
}
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
@@ -1549,6 +1552,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
status = nfs4_recovery_handle_error(clp, status);
goto out;
}
+ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
/* create_session negotiated new slot table */
clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c3ccd2c46834..d869a5e5464b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
1 /* layoutupdate4 layout type */ + \
1 /* NULL filelayout layoutupdate4 payload */)
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
-
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 1 /* FIXME: opaque lrf_body always empty at the moment */)
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+ 1 + decode_stateid_maxsz)
#else /* CONFIG_NFS_V4_1 */
#define encode_sequence_maxsz 0
#define decode_sequence_maxsz 0
@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
decode_putfh_maxsz + \
decode_layoutcommit_maxsz + \
decode_getattr_maxsz)
-
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz)
const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
compound_encode_hdr_maxsz +
@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
static int
encode_layoutcommit(struct xdr_stream *xdr,
+ struct inode *inode,
const struct nfs4_layoutcommit_args *args,
struct compound_hdr *hdr)
{
@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
NFS_SERVER(args->inode)->pnfs_curr_ld->id);
- p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
*p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
/* Only whole file layouts */
p = xdr_encode_hyper(p, 0); /* offset */
@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->lastbytewritten);
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
- *p++ = cpu_to_be32(0); /* no file layout payload */
+
+ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+ NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
+ NFS_I(inode)->layout, xdr, args);
+ else {
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0); /* no layout-type payload */
+ }
hdr->nops++;
hdr->replen += decode_layoutcommit_maxsz;
return 0;
}
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(OP_LAYOUTRETURN);
+ *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
+ *p++ = cpu_to_be32(args->layout_type);
+ *p++ = cpu_to_be32(IOMODE_ANY);
+ *p = cpu_to_be32(RETURN_FILE);
+ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+ p = xdr_encode_hyper(p, 0);
+ p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
+ spin_lock(&args->inode->i_lock);
+ xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
+ spin_unlock(&args->inode->i_lock);
+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
+ NFS_I(args->inode)->layout, xdr, args);
+ } else {
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0);
+ }
+ hdr->nops++;
+ hdr->replen += decode_layoutreturn_maxsz;
+}
#endif /* CONFIG_NFS_V4_1 */
/*
@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
/*
* Encode LAYOUTCOMMIT request
*/
-static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
- struct xdr_stream *xdr,
- struct nfs4_layoutcommit_args *args)
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutcommit_args *args)
{
+ struct nfs4_layoutcommit_data *data =
+ container_of(args, struct nfs4_layoutcommit_data, args);
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, NFS_FH(args->inode), &hdr);
- encode_layoutcommit(xdr, args, &hdr);
+ encode_layoutcommit(xdr, data->args.inode, args, &hdr);
encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
- return 0;
+}
+
+/*
+ * Encode LAYOUTRETURN request
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ encode_layoutreturn(xdr, args, &hdr);
+ encode_nops(&hdr);
}
#endif /* CONFIG_NFS_V4_1 */
@@ -5203,6 +5271,27 @@ out_overflow:
return -EIO;
}
+static int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+ if (status)
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->lrs_present = be32_to_cpup(p);
+ if (res->lrs_present)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_layoutcommit(struct xdr_stream *xdr,
struct rpc_rqst *req,
struct nfs4_layoutcommit_res *res)
@@ -6320,6 +6409,30 @@ out:
}
/*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_layoutreturn(xdr, res);
+out:
+ return status;
+}
+
+/*
* Decode LAYOUTCOMMIT response
*/
static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
@@ -6547,6 +6660,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
+ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
#endif /* CONFIG_NFS_V4_1 */
};
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c541093a5bf2..c4744e1d513c 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -87,7 +87,7 @@
#define NFS_ROOT "/tftpboot/%s"
/* Default NFSROOT mount options. */
-#define NFS_DEF_OPTIONS "udp"
+#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
static char nfs_root_parms[256] __initdata = "";
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 000000000000..ed30ea072bb8
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Objects Layout Driver kernel module
+#
+objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 000000000000..9cf208df1f25
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,1057 @@
+/*
+ * pNFS Objects layout implementation over open-osd initiator library
+ *
+ * Copyright (C) 2009 Panasas Inc. [year of first publication]
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <scsi/osd_initiator.h>
+
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+#define _LLU(x) ((unsigned long long)x)
+
+enum { BIO_MAX_PAGES_KMALLOC =
+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
+
+struct objio_dev_ent {
+ struct nfs4_deviceid_node id_node;
+ struct osd_dev *od;
+};
+
+static void
+objio_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
+
+ dprintk("%s: free od=%p\n", __func__, de->od);
+ osduld_put_device(de->od);
+ kfree(de);
+}
+
+static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
+ const struct nfs4_deviceid *d_id)
+{
+ struct nfs4_deviceid_node *d;
+ struct objio_dev_ent *de;
+
+ d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
+ if (!d)
+ return NULL;
+
+ de = container_of(d, struct objio_dev_ent, id_node);
+ return de;
+}
+
+static struct objio_dev_ent *
+_dev_list_add(const struct nfs_server *nfss,
+ const struct nfs4_deviceid *d_id, struct osd_dev *od,
+ gfp_t gfp_flags)
+{
+ struct nfs4_deviceid_node *d;
+ struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
+ struct objio_dev_ent *n;
+
+ if (!de) {
+ dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+ return NULL;
+ }
+
+ dprintk("%s: Adding od=%p\n", __func__, od);
+ nfs4_init_deviceid_node(&de->id_node,
+ nfss->pnfs_curr_ld,
+ nfss->nfs_client,
+ d_id);
+ de->od = od;
+
+ d = nfs4_insert_deviceid_node(&de->id_node);
+ n = container_of(d, struct objio_dev_ent, id_node);
+ if (n != de) {
+ dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
+ objio_free_deviceid_node(&de->id_node);
+ de = n;
+ }
+
+ atomic_inc(&de->id_node.ref);
+ return de;
+}
+
+struct caps_buffers {
+ u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
+ u8 creds[OSD_CAP_LEN];
+};
+
+struct objio_segment {
+ struct pnfs_layout_segment lseg;
+
+ struct pnfs_osd_object_cred *comps;
+
+ unsigned mirrors_p1;
+ unsigned stripe_unit;
+ unsigned group_width; /* Data stripe_units without integrity comps */
+ u64 group_depth;
+ unsigned group_count;
+
+ unsigned max_io_size;
+
+ unsigned comps_index;
+ unsigned num_comps;
+ /* variable length */
+ struct objio_dev_ent *ods[];
+};
+
+static inline struct objio_segment *
+OBJIO_LSEG(struct pnfs_layout_segment *lseg)
+{
+ return container_of(lseg, struct objio_segment, lseg);
+}
+
+struct objio_state;
+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
+
+struct objio_state {
+ /* Generic layer */
+ struct objlayout_io_state ol_state;
+
+ struct objio_segment *layout;
+
+ struct kref kref;
+ objio_done_fn done;
+ void *private;
+
+ unsigned long length;
+ unsigned numdevs; /* Actually used devs in this IO */
+ /* A per-device variable array of size numdevs */
+ struct _objio_per_comp {
+ struct bio *bio;
+ struct osd_request *or;
+ unsigned long length;
+ u64 offset;
+ unsigned dev;
+ } per_dev[];
+};
+
+/* Send and wait for a get_device_info of devices in the layout,
+ then look them up with the osd_initiator library */
+static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg, unsigned comp,
+ gfp_t gfp_flags)
+{
+ struct pnfs_osd_deviceaddr *deviceaddr;
+ struct nfs4_deviceid *d_id;
+ struct objio_dev_ent *ode;
+ struct osd_dev *od;
+ struct osd_dev_info odi;
+ int err;
+
+ d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
+
+ ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
+ if (ode)
+ return ode;
+
+ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
+ if (unlikely(err)) {
+ dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
+ __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
+ return ERR_PTR(err);
+ }
+
+ odi.systemid_len = deviceaddr->oda_systemid.len;
+ if (odi.systemid_len > sizeof(odi.systemid)) {
+ err = -EINVAL;
+ goto out;
+ } else if (odi.systemid_len)
+ memcpy(odi.systemid, deviceaddr->oda_systemid.data,
+ odi.systemid_len);
+ odi.osdname_len = deviceaddr->oda_osdname.len;
+ odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
+
+ if (!odi.osdname_len && !odi.systemid_len) {
+ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
+ __func__);
+ err = -ENODEV;
+ goto out;
+ }
+
+ od = osduld_info_lookup(&odi);
+ if (unlikely(IS_ERR(od))) {
+ err = PTR_ERR(od);
+ dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+ goto out;
+ }
+
+ ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
+ gfp_flags);
+
+out:
+ dprintk("%s: return=%d\n", __func__, err);
+ objlayout_put_deviceinfo(deviceaddr);
+ return err ? ERR_PTR(err) : ode;
+}
+
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg,
+ gfp_t gfp_flags)
+{
+ unsigned i;
+ int err;
+
+ /* lookup all devices */
+ for (i = 0; i < objio_seg->num_comps; i++) {
+ struct objio_dev_ent *ode;
+
+ ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
+ if (unlikely(IS_ERR(ode))) {
+ err = PTR_ERR(ode);
+ goto out;
+ }
+ objio_seg->ods[i] = ode;
+ }
+ err = 0;
+
+out:
+ dprintk("%s: return=%d\n", __func__, err);
+ return err;
+}
+
+static int _verify_data_map(struct pnfs_osd_layout *layout)
+{
+ struct pnfs_osd_data_map *data_map = &layout->olo_map;
+ u64 stripe_length;
+ u32 group_width;
+
+/* FIXME: Only raid0 for now. if not go through MDS */
+ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
+ printk(KERN_ERR "Only RAID_0 for now\n");
+ return -ENOTSUPP;
+ }
+ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
+ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
+ data_map->odm_num_comps, data_map->odm_mirror_cnt);
+ return -EINVAL;
+ }
+
+ if (data_map->odm_group_width)
+ group_width = data_map->odm_group_width;
+ else
+ group_width = data_map->odm_num_comps /
+ (data_map->odm_mirror_cnt + 1);
+
+ stripe_length = (u64)data_map->odm_stripe_unit * group_width;
+ if (stripe_length >= (1ULL << 32)) {
+ printk(KERN_ERR "Total Stripe length(0x%llx)"
+ " >= 32bit is not supported\n", _LLU(stripe_length));
+ return -ENOTSUPP;
+ }
+
+ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
+ printk(KERN_ERR "Stripe Unit(0x%llx)"
+ " must be Multples of PAGE_SIZE(0x%lx)\n",
+ _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
+ return -ENOTSUPP;
+ }
+
+ return 0;
+}
+
+static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
+ struct pnfs_osd_object_cred *src_comp,
+ struct caps_buffers *caps_p)
+{
+ WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
+ WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
+
+ *cur_comp = *src_comp;
+
+ memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
+ sizeof(caps_p->caps_key));
+ cur_comp->oc_cap_key.cred = caps_p->caps_key;
+
+ memcpy(caps_p->creds, src_comp->oc_cap.cred,
+ sizeof(caps_p->creds));
+ cur_comp->oc_cap.cred = caps_p->creds;
+}
+
+int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+ struct pnfs_layout_hdr *pnfslay,
+ struct pnfs_layout_range *range,
+ struct xdr_stream *xdr,
+ gfp_t gfp_flags)
+{
+ struct objio_segment *objio_seg;
+ struct pnfs_osd_xdr_decode_layout_iter iter;
+ struct pnfs_osd_layout layout;
+ struct pnfs_osd_object_cred *cur_comp, src_comp;
+ struct caps_buffers *caps_p;
+ int err;
+
+ err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
+ if (unlikely(err))
+ return err;
+
+ err = _verify_data_map(&layout);
+ if (unlikely(err))
+ return err;
+
+ objio_seg = kzalloc(sizeof(*objio_seg) +
+ sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
+ sizeof(*objio_seg->comps) * layout.olo_num_comps +
+ sizeof(struct caps_buffers) * layout.olo_num_comps,
+ gfp_flags);
+ if (!objio_seg)
+ return -ENOMEM;
+
+ objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
+ cur_comp = objio_seg->comps;
+ caps_p = (void *)(cur_comp + layout.olo_num_comps);
+ while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
+ copy_single_comp(cur_comp++, &src_comp, caps_p++);
+ if (unlikely(err))
+ goto err;
+
+ objio_seg->num_comps = layout.olo_num_comps;
+ objio_seg->comps_index = layout.olo_comps_index;
+ err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
+ if (err)
+ goto err;
+
+ objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+ objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
+ if (layout.olo_map.odm_group_width) {
+ objio_seg->group_width = layout.olo_map.odm_group_width;
+ objio_seg->group_depth = layout.olo_map.odm_group_depth;
+ objio_seg->group_count = layout.olo_map.odm_num_comps /
+ objio_seg->mirrors_p1 /
+ objio_seg->group_width;
+ } else {
+ objio_seg->group_width = layout.olo_map.odm_num_comps /
+ objio_seg->mirrors_p1;
+ objio_seg->group_depth = -1;
+ objio_seg->group_count = 1;
+ }
+
+ /* Cache this calculation it will hit for every page */
+ objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
+ objio_seg->stripe_unit) *
+ objio_seg->group_width;
+
+ *outp = &objio_seg->lseg;
+ return 0;
+
+err:
+ kfree(objio_seg);
+ dprintk("%s: Error: return %d\n", __func__, err);
+ *outp = NULL;
+ return err;
+}
+
+void objio_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ int i;
+ struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+
+ for (i = 0; i < objio_seg->num_comps; i++) {
+ if (!objio_seg->ods[i])
+ break;
+ nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
+ }
+ kfree(objio_seg);
+}
+
+int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
+ struct objlayout_io_state **outp,
+ gfp_t gfp_flags)
+{
+ struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+ struct objio_state *ios;
+ const unsigned first_size = sizeof(*ios) +
+ objio_seg->num_comps * sizeof(ios->per_dev[0]);
+ const unsigned sec_size = objio_seg->num_comps *
+ sizeof(ios->ol_state.ioerrs[0]);
+
+ ios = kzalloc(first_size + sec_size, gfp_flags);
+ if (unlikely(!ios))
+ return -ENOMEM;
+
+ ios->layout = objio_seg;
+ ios->ol_state.ioerrs = ((void *)ios) + first_size;
+ ios->ol_state.num_comps = objio_seg->num_comps;
+
+ *outp = &ios->ol_state;
+ return 0;
+}
+
+void objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+
+ kfree(ios);
+}
+
+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+ switch (oep) {
+ case OSD_ERR_PRI_NO_ERROR:
+ return (enum pnfs_osd_errno)0;
+
+ case OSD_ERR_PRI_CLEAR_PAGES:
+ BUG_ON(1);
+ return 0;
+
+ case OSD_ERR_PRI_RESOURCE:
+ return PNFS_OSD_ERR_RESOURCE;
+ case OSD_ERR_PRI_BAD_CRED:
+ return PNFS_OSD_ERR_BAD_CRED;
+ case OSD_ERR_PRI_NO_ACCESS:
+ return PNFS_OSD_ERR_NO_ACCESS;
+ case OSD_ERR_PRI_UNREACHABLE:
+ return PNFS_OSD_ERR_UNREACHABLE;
+ case OSD_ERR_PRI_NOT_FOUND:
+ return PNFS_OSD_ERR_NOT_FOUND;
+ case OSD_ERR_PRI_NO_SPACE:
+ return PNFS_OSD_ERR_NO_SPACE;
+ default:
+ WARN_ON(1);
+ /* fallthrough */
+ case OSD_ERR_PRI_EIO:
+ return PNFS_OSD_ERR_EIO;
+ }
+}
+
+static void _clear_bio(struct bio *bio)
+{
+ struct bio_vec *bv;
+ unsigned i;
+
+ __bio_for_each_segment(bv, bio, i, 0) {
+ unsigned this_count = bv->bv_len;
+
+ if (likely(PAGE_SIZE == this_count))
+ clear_highpage(bv->bv_page);
+ else
+ zero_user(bv->bv_page, bv->bv_offset, this_count);
+ }
+}
+
+static int _io_check(struct objio_state *ios, bool is_write)
+{
+ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
+ int lin_ret = 0;
+ int i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_sense_info osi;
+ struct osd_request *or = ios->per_dev[i].or;
+ unsigned dev;
+ int ret;
+
+ if (!or)
+ continue;
+
+ ret = osd_req_decode_sense(or, &osi);
+ if (likely(!ret))
+ continue;
+
+ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+ /* start read offset passed endof file */
+ BUG_ON(is_write);
+ _clear_bio(ios->per_dev[i].bio);
+ dprintk("%s: start read offset passed end of file "
+ "offset=0x%llx, length=0x%lx\n", __func__,
+ _LLU(ios->per_dev[i].offset),
+ ios->per_dev[i].length);
+
+ continue; /* we recovered */
+ }
+ dev = ios->per_dev[i].dev;
+ objlayout_io_set_result(&ios->ol_state, dev,
+ &ios->layout->comps[dev].oc_object_id,
+ osd_pri_2_pnfs_err(osi.osd_err_pri),
+ ios->per_dev[i].offset,
+ ios->per_dev[i].length,
+ is_write);
+
+ if (osi.osd_err_pri >= oep) {
+ oep = osi.osd_err_pri;
+ lin_ret = ret;
+ }
+ }
+
+ return lin_ret;
+}
+
+/*
+ * Common IO state helpers.
+ */
+static void _io_free(struct objio_state *ios)
+{
+ unsigned i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct _objio_per_comp *per_dev = &ios->per_dev[i];
+
+ if (per_dev->or) {
+ osd_end_request(per_dev->or);
+ per_dev->or = NULL;
+ }
+
+ if (per_dev->bio) {
+ bio_put(per_dev->bio);
+ per_dev->bio = NULL;
+ }
+ }
+}
+
+struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
+{
+ unsigned min_dev = ios->layout->comps_index;
+ unsigned max_dev = min_dev + ios->layout->num_comps;
+
+ BUG_ON(dev < min_dev || max_dev <= dev);
+ return ios->layout->ods[dev - min_dev]->od;
+}
+
+struct _striping_info {
+ u64 obj_offset;
+ u64 group_length;
+ unsigned dev;
+ unsigned unit_off;
+};
+
+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
+ struct _striping_info *si)
+{
+ u32 stripe_unit = ios->layout->stripe_unit;
+ u32 group_width = ios->layout->group_width;
+ u64 group_depth = ios->layout->group_depth;
+ u32 U = stripe_unit * group_width;
+
+ u64 T = U * group_depth;
+ u64 S = T * ios->layout->group_count;
+ u64 M = div64_u64(file_offset, S);
+
+ /*
+ G = (L - (M * S)) / T
+ H = (L - (M * S)) % T
+ */
+ u64 LmodU = file_offset - M * S;
+ u32 G = div64_u64(LmodU, T);
+ u64 H = LmodU - G * T;
+
+ u32 N = div_u64(H, U);
+
+ div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+ si->obj_offset = si->unit_off + (N * stripe_unit) +
+ (M * group_depth * stripe_unit);
+
+ /* "H - (N * U)" is just "H % U" so it's bound to u32 */
+ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+ si->dev *= ios->layout->mirrors_p1;
+
+ si->group_length = T - H;
+}
+
+static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
+ gfp_t gfp_flags)
+{
+ unsigned pg = *cur_pg;
+ struct request_queue *q =
+ osd_request_queue(_io_od(ios, per_dev->dev));
+
+ per_dev->length += cur_len;
+
+ if (per_dev->bio == NULL) {
+ unsigned stripes = ios->layout->num_comps /
+ ios->layout->mirrors_p1;
+ unsigned pages_in_stripe = stripes *
+ (ios->layout->stripe_unit / PAGE_SIZE);
+ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+ stripes;
+
+ if (BIO_MAX_PAGES_KMALLOC < bio_size)
+ bio_size = BIO_MAX_PAGES_KMALLOC;
+
+ per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
+ if (unlikely(!per_dev->bio)) {
+ dprintk("Faild to allocate BIO size=%u\n", bio_size);
+ return -ENOMEM;
+ }
+ }
+
+ while (cur_len > 0) {
+ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+ unsigned added_len;
+
+ BUG_ON(ios->ol_state.nr_pages <= pg);
+ cur_len -= pglen;
+
+ added_len = bio_add_pc_page(q, per_dev->bio,
+ ios->ol_state.pages[pg], pglen, pgbase);
+ if (unlikely(pglen != added_len))
+ return -ENOMEM;
+ pgbase = 0;
+ ++pg;
+ }
+ BUG_ON(cur_len);
+
+ *cur_pg = pg;
+ return 0;
+}
+
+static int _prepare_one_group(struct objio_state *ios, u64 length,
+ struct _striping_info *si, unsigned *last_pg,
+ gfp_t gfp_flags)
+{
+ unsigned stripe_unit = ios->layout->stripe_unit;
+ unsigned mirrors_p1 = ios->layout->mirrors_p1;
+ unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+ unsigned dev = si->dev;
+ unsigned first_dev = dev - (dev % devs_in_group);
+ unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+ unsigned cur_pg = *last_pg;
+ int ret = 0;
+
+ while (length) {
+ struct _objio_per_comp *per_dev = &ios->per_dev[dev];
+ unsigned cur_len, page_off = 0;
+
+ if (!per_dev->length) {
+ per_dev->dev = dev;
+ if (dev < si->dev) {
+ per_dev->offset = si->obj_offset + stripe_unit -
+ si->unit_off;
+ cur_len = stripe_unit;
+ } else if (dev == si->dev) {
+ per_dev->offset = si->obj_offset;
+ cur_len = stripe_unit - si->unit_off;
+ page_off = si->unit_off & ~PAGE_MASK;
+ BUG_ON(page_off &&
+ (page_off != ios->ol_state.pgbase));
+ } else { /* dev > si->dev */
+ per_dev->offset = si->obj_offset - si->unit_off;
+ cur_len = stripe_unit;
+ }
+
+ if (max_comp < dev)
+ max_comp = dev;
+ } else {
+ cur_len = stripe_unit;
+ }
+ if (cur_len >= length)
+ cur_len = length;
+
+ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+ cur_len, gfp_flags);
+ if (unlikely(ret))
+ goto out;
+
+ dev += mirrors_p1;
+ dev = (dev % devs_in_group) + first_dev;
+
+ length -= cur_len;
+ ios->length += cur_len;
+ }
+out:
+ ios->numdevs = max_comp + mirrors_p1;
+ *last_pg = cur_pg;
+ return ret;
+}
+
+static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
+{
+ u64 length = ios->ol_state.count;
+ u64 offset = ios->ol_state.offset;
+ struct _striping_info si;
+ unsigned last_pg = 0;
+ int ret = 0;
+
+ while (length) {
+ _calc_stripe_info(ios, offset, &si);
+
+ if (length < si.group_length)
+ si.group_length = length;
+
+ ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
+ if (unlikely(ret))
+ goto out;
+
+ offset += si.group_length;
+ length -= si.group_length;
+ }
+
+out:
+ if (!ios->length)
+ return ret;
+
+ return 0;
+}
+
+static ssize_t _sync_done(struct objio_state *ios)
+{
+ struct completion *waiting = ios->private;
+
+ complete(waiting);
+ return 0;
+}
+
+static void _last_io(struct kref *kref)
+{
+ struct objio_state *ios = container_of(kref, struct objio_state, kref);
+
+ ios->done(ios);
+}
+
+static void _done_io(struct osd_request *or, void *p)
+{
+ struct objio_state *ios = p;
+
+ kref_put(&ios->kref, _last_io);
+}
+
+static ssize_t _io_exec(struct objio_state *ios)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ ssize_t status = 0; /* sync status */
+ unsigned i;
+ objio_done_fn saved_done_fn = ios->done;
+ bool sync = ios->ol_state.sync;
+
+ if (sync) {
+ ios->done = _sync_done;
+ ios->private = &wait;
+ }
+
+ kref_init(&ios->kref);
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_request *or = ios->per_dev[i].or;
+
+ if (!or)
+ continue;
+
+ kref_get(&ios->kref);
+ osd_execute_request_async(or, _done_io, ios);
+ }
+
+ kref_put(&ios->kref, _last_io);
+
+ if (sync) {
+ wait_for_completion(&wait);
+ status = saved_done_fn(ios);
+ }
+
+ return status;
+}
+
+/*
+ * read
+ */
+static ssize_t _read_done(struct objio_state *ios)
+{
+ ssize_t status;
+ int ret = _io_check(ios, false);
+
+ _io_free(ios);
+
+ if (likely(!ret))
+ status = ios->length;
+ else
+ status = ret;
+
+ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+ return status;
+}
+
+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+ struct osd_request *or = NULL;
+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+ unsigned dev = per_dev->dev;
+ struct pnfs_osd_object_cred *cred =
+ &ios->layout->comps[dev];
+ struct osd_obj_id obj = {
+ .partition = cred->oc_object_id.oid_partition_id,
+ .id = cred->oc_object_id.oid_object_id,
+ };
+ int ret;
+
+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+ if (unlikely(!or)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ per_dev->or = or;
+
+ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
+
+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ if (ret) {
+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
+ __func__, ret);
+ goto err;
+ }
+
+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+ per_dev->length);
+
+err:
+ return ret;
+}
+
+static ssize_t _read_exec(struct objio_state *ios)
+{
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+ if (!ios->per_dev[i].length)
+ continue;
+ ret = _read_mirrors(ios, i);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ ios->done = _read_done;
+ return _io_exec(ios); /* In sync mode exec returns the io status */
+
+err:
+ _io_free(ios);
+ return ret;
+}
+
+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+ int ret;
+
+ ret = _io_rw_pagelist(ios, GFP_KERNEL);
+ if (unlikely(ret))
+ return ret;
+
+ return _read_exec(ios);
+}
+
+/*
+ * write
+ */
+static ssize_t _write_done(struct objio_state *ios)
+{
+ ssize_t status;
+ int ret = _io_check(ios, true);
+
+ _io_free(ios);
+
+ if (likely(!ret)) {
+ /* FIXME: should be based on the OSD's persistence model
+ * See OSD2r05 Section 4.13 Data persistence model */
+ ios->ol_state.committed = NFS_FILE_SYNC;
+ status = ios->length;
+ } else {
+ status = ret;
+ }
+
+ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+ return status;
+}
+
+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
+ unsigned dev = ios->per_dev[cur_comp].dev;
+ unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+ int ret;
+
+ for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+ struct osd_request *or = NULL;
+ struct pnfs_osd_object_cred *cred =
+ &ios->layout->comps[dev];
+ struct osd_obj_id obj = {
+ .partition = cred->oc_object_id.oid_partition_id,
+ .id = cred->oc_object_id.oid_object_id,
+ };
+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+ struct bio *bio;
+
+ or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
+ if (unlikely(!or)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ per_dev->or = or;
+
+ if (per_dev != master_dev) {
+ bio = bio_kmalloc(GFP_NOFS,
+ master_dev->bio->bi_max_vecs);
+ if (unlikely(!bio)) {
+ dprintk("Faild to allocate BIO size=%u\n",
+ master_dev->bio->bi_max_vecs);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ __bio_clone(bio, master_dev->bio);
+ bio->bi_bdev = NULL;
+ bio->bi_next = NULL;
+ per_dev->bio = bio;
+ per_dev->dev = dev;
+ per_dev->length = master_dev->length;
+ per_dev->offset = master_dev->offset;
+ } else {
+ bio = master_dev->bio;
+ bio->bi_rw |= REQ_WRITE;
+ }
+
+ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+
+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ if (ret) {
+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
+ __func__, ret);
+ goto err;
+ }
+
+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+ per_dev->length);
+ }
+
+err:
+ return ret;
+}
+
+static ssize_t _write_exec(struct objio_state *ios)
+{
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+ if (!ios->per_dev[i].length)
+ continue;
+ ret = _write_mirrors(ios, i);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ ios->done = _write_done;
+ return _io_exec(ios); /* In sync mode exec returns the io->status */
+
+err:
+ _io_free(ios);
+ return ret;
+}
+
+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+{
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+ int ret;
+
+ /* TODO: ios->stable = stable; */
+ ret = _io_rw_pagelist(ios, GFP_NOFS);
+ if (unlikely(ret))
+ return ret;
+
+ return _write_exec(ios);
+}
+
+static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev, struct nfs_page *req)
+{
+ if (!pnfs_generic_pg_test(pgio, prev, req))
+ return false;
+
+ return pgio->pg_count + req->wb_bytes <=
+ OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
+}
+
+static struct pnfs_layoutdriver_type objlayout_type = {
+ .id = LAYOUT_OSD2_OBJECTS,
+ .name = "LAYOUT_OSD2_OBJECTS",
+ .flags = PNFS_LAYOUTRET_ON_SETATTR,
+
+ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
+ .free_layout_hdr = objlayout_free_layout_hdr,
+
+ .alloc_lseg = objlayout_alloc_lseg,
+ .free_lseg = objlayout_free_lseg,
+
+ .read_pagelist = objlayout_read_pagelist,
+ .write_pagelist = objlayout_write_pagelist,
+ .pg_test = objio_pg_test,
+
+ .free_deviceid_node = objio_free_deviceid_node,
+
+ .encode_layoutcommit = objlayout_encode_layoutcommit,
+ .encode_layoutreturn = objlayout_encode_layoutreturn,
+};
+
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
+MODULE_LICENSE("GPL");
+
+static int __init
+objlayout_init(void)
+{
+ int ret = pnfs_register_layoutdriver(&objlayout_type);
+
+ if (ret)
+ printk(KERN_INFO
+ "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+ __func__, ret);
+ else
+ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+ __func__);
+ return ret;
+}
+
+static void __exit
+objlayout_exit(void)
+{
+ pnfs_unregister_layoutdriver(&objlayout_type);
+ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
+ __func__);
+}
+
+module_init(objlayout_init);
+module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 000000000000..dc3956c0de80
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,712 @@
+/*
+ * pNFS Objects layout driver high level definitions
+ *
+ * Copyright (C) 2007 Panasas Inc. [year of first publication]
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <scsi/osd_initiator.h>
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+/*
+ * Create a objlayout layout structure for the given inode and return it.
+ */
+struct pnfs_layout_hdr *
+objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+ struct objlayout *objlay;
+
+ objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
+ if (objlay) {
+ spin_lock_init(&objlay->lock);
+ INIT_LIST_HEAD(&objlay->err_list);
+ }
+ dprintk("%s: Return %p\n", __func__, objlay);
+ return &objlay->pnfs_layout;
+}
+
+/*
+ * Free an objlayout layout structure
+ */
+void
+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct objlayout *objlay = OBJLAYOUT(lo);
+
+ dprintk("%s: objlay %p\n", __func__, objlay);
+
+ WARN_ON(!list_empty(&objlay->err_list));
+ kfree(objlay);
+}
+
+/*
+ * Unmarshall layout and store it in pnfslay.
+ */
+struct pnfs_layout_segment *
+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
+{
+ int status = -ENOMEM;
+ struct xdr_stream stream;
+ struct xdr_buf buf = {
+ .pages = lgr->layoutp->pages,
+ .page_len = lgr->layoutp->len,
+ .buflen = lgr->layoutp->len,
+ .len = lgr->layoutp->len,
+ };
+ struct page *scratch;
+ struct pnfs_layout_segment *lseg;
+
+ dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
+
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ goto err_nofree;
+
+ xdr_init_decode(&stream, &buf, NULL);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
+ if (unlikely(status)) {
+ dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
+ status);
+ goto err;
+ }
+
+ __free_page(scratch);
+
+ dprintk("%s: Return %p\n", __func__, lseg);
+ return lseg;
+
+err:
+ __free_page(scratch);
+err_nofree:
+ dprintk("%s: Err Return=>%d\n", __func__, status);
+ return ERR_PTR(status);
+}
+
+/*
+ * Free a layout segement
+ */
+void
+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+
+ if (unlikely(!lseg))
+ return;
+
+ objio_free_lseg(lseg);
+}
+
+/*
+ * I/O Operations
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
+
+static struct objlayout_io_state *
+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+ struct page **pages,
+ unsigned pgbase,
+ loff_t offset,
+ size_t count,
+ struct pnfs_layout_segment *lseg,
+ void *rpcdata,
+ gfp_t gfp_flags)
+{
+ struct objlayout_io_state *state;
+ u64 lseg_end_offset;
+
+ dprintk("%s: allocating io_state\n", __func__);
+ if (objio_alloc_io_state(lseg, &state, gfp_flags))
+ return NULL;
+
+ BUG_ON(offset < lseg->pls_range.offset);
+ lseg_end_offset = end_offset(lseg->pls_range.offset,
+ lseg->pls_range.length);
+ BUG_ON(offset >= lseg_end_offset);
+ if (offset + count > lseg_end_offset) {
+ count = lseg->pls_range.length -
+ (offset - lseg->pls_range.offset);
+ dprintk("%s: truncated count %Zd\n", __func__, count);
+ }
+
+ if (pgbase > PAGE_SIZE) {
+ pages += pgbase >> PAGE_SHIFT;
+ pgbase &= ~PAGE_MASK;
+ }
+
+ INIT_LIST_HEAD(&state->err_list);
+ state->lseg = lseg;
+ state->rpcdata = rpcdata;
+ state->pages = pages;
+ state->pgbase = pgbase;
+ state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ state->offset = offset;
+ state->count = count;
+ state->sync = 0;
+
+ return state;
+}
+
+static void
+objlayout_free_io_state(struct objlayout_io_state *state)
+{
+ dprintk("%s: freeing io_state\n", __func__);
+ if (unlikely(!state))
+ return;
+
+ objio_free_io_state(state);
+}
+
+/*
+ * I/O done common code
+ */
+static void
+objlayout_iodone(struct objlayout_io_state *state)
+{
+ dprintk("%s: state %p status\n", __func__, state);
+
+ if (likely(state->status >= 0)) {
+ objlayout_free_io_state(state);
+ } else {
+ struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+
+ spin_lock(&objlay->lock);
+ objlay->delta_space_valid = OBJ_DSU_INVALID;
+ list_add(&objlay->err_list, &state->err_list);
+ spin_unlock(&objlay->lock);
+ }
+}
+
+/*
+ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
+ *
+ * The @index component IO failed (error returned from target). Register
+ * the error for later reporting at layout-return.
+ */
+void
+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+ struct pnfs_osd_objid *pooid, int osd_error,
+ u64 offset, u64 length, bool is_write)
+{
+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
+
+ BUG_ON(index >= state->num_comps);
+ if (osd_error) {
+ ioerr->oer_component = *pooid;
+ ioerr->oer_comp_offset = offset;
+ ioerr->oer_comp_length = length;
+ ioerr->oer_iswrite = is_write;
+ ioerr->oer_errno = osd_error;
+
+ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
+ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
+ __func__, index, ioerr->oer_errno,
+ ioerr->oer_iswrite,
+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
+ ioerr->oer_component.oid_partition_id,
+ ioerr->oer_component.oid_object_id,
+ ioerr->oer_comp_offset,
+ ioerr->oer_comp_length);
+ } else {
+ /* User need not call if no error is reported */
+ ioerr->oer_errno = 0;
+ }
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_read_data *rdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ rdata = container_of(task, struct nfs_read_data, task);
+
+ pnfs_ld_read_done(rdata);
+}
+
+void
+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+{
+ int eof = state->eof;
+ struct nfs_read_data *rdata;
+
+ state->status = status;
+ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
+ rdata = state->rpcdata;
+ rdata->task.tk_status = status;
+ if (status >= 0) {
+ rdata->res.count = status;
+ rdata->res.eof = eof;
+ }
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_ld_read_done(rdata);
+ else {
+ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
+ schedule_work(&rdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async reads.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_read_data *rdata)
+{
+ loff_t offset = rdata->args.offset;
+ size_t count = rdata->args.count;
+ struct objlayout_io_state *state;
+ ssize_t status = 0;
+ loff_t eof;
+
+ dprintk("%s: Begin inode %p offset %llu count %d\n",
+ __func__, rdata->inode, offset, (int)count);
+
+ eof = i_size_read(rdata->inode);
+ if (unlikely(offset + count > eof)) {
+ if (offset >= eof) {
+ status = 0;
+ rdata->res.count = 0;
+ rdata->res.eof = 1;
+ goto out;
+ }
+ count = eof - offset;
+ }
+
+ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
+ rdata->args.pages, rdata->args.pgbase,
+ offset, count,
+ rdata->lseg, rdata,
+ GFP_KERNEL);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->eof = state->offset + state->count >= eof;
+
+ status = objio_read_pagelist(state);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ rdata->pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+
+ pnfs_ld_write_done(wdata);
+}
+
+void
+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
+ bool sync)
+{
+ struct nfs_write_data *wdata;
+
+ dprintk("%s: Begin\n", __func__);
+ wdata = state->rpcdata;
+ state->status = status;
+ wdata->task.tk_status = status;
+ if (status >= 0) {
+ wdata->res.count = status;
+ wdata->verf.committed = state->committed;
+ dprintk("%s: Return status %d committed %d\n",
+ __func__, wdata->task.tk_status,
+ wdata->verf.committed);
+ } else
+ dprintk("%s: Return status %d\n",
+ __func__, wdata->task.tk_status);
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_ld_write_done(wdata);
+ else {
+ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
+ schedule_work(&wdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async writes.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_write_data *wdata,
+ int how)
+{
+ struct objlayout_io_state *state;
+ ssize_t status;
+
+ dprintk("%s: Begin inode %p offset %llu count %u\n",
+ __func__, wdata->inode, wdata->args.offset, wdata->args.count);
+
+ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
+ wdata->args.pages,
+ wdata->args.pgbase,
+ wdata->args.offset,
+ wdata->args.count,
+ wdata->lseg, wdata,
+ GFP_NOFS);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->sync = how & FLUSH_SYNC;
+
+ status = objio_write_pagelist(state, how & FLUSH_STABLE);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ wdata->pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+void
+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *args)
+{
+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
+ struct pnfs_osd_layoutupdate lou;
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+
+ spin_lock(&objlay->lock);
+ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
+ lou.dsu_delta = objlay->delta_space_used;
+ objlay->delta_space_used = 0;
+ objlay->delta_space_valid = OBJ_DSU_INIT;
+ lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
+ spin_unlock(&objlay->lock);
+
+ start = xdr_reserve_space(xdr, 4);
+
+ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+
+ dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
+ lou.dsu_delta, lou.olu_ioerr_flag);
+}
+
+static int
+err_prio(u32 oer_errno)
+{
+ switch (oer_errno) {
+ case 0:
+ return 0;
+
+ case PNFS_OSD_ERR_RESOURCE:
+ return OSD_ERR_PRI_RESOURCE;
+ case PNFS_OSD_ERR_BAD_CRED:
+ return OSD_ERR_PRI_BAD_CRED;
+ case PNFS_OSD_ERR_NO_ACCESS:
+ return OSD_ERR_PRI_NO_ACCESS;
+ case PNFS_OSD_ERR_UNREACHABLE:
+ return OSD_ERR_PRI_UNREACHABLE;
+ case PNFS_OSD_ERR_NOT_FOUND:
+ return OSD_ERR_PRI_NOT_FOUND;
+ case PNFS_OSD_ERR_NO_SPACE:
+ return OSD_ERR_PRI_NO_SPACE;
+ default:
+ WARN_ON(1);
+ /* fallthrough */
+ case PNFS_OSD_ERR_EIO:
+ return OSD_ERR_PRI_EIO;
+ }
+}
+
+static void
+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
+ const struct pnfs_osd_ioerr *src_err)
+{
+ u64 dest_end, src_end;
+
+ if (!dest_err->oer_errno) {
+ *dest_err = *src_err;
+ /* accumulated device must be blank */
+ memset(&dest_err->oer_component.oid_device_id, 0,
+ sizeof(dest_err->oer_component.oid_device_id));
+
+ return;
+ }
+
+ if (dest_err->oer_component.oid_partition_id !=
+ src_err->oer_component.oid_partition_id)
+ dest_err->oer_component.oid_partition_id = 0;
+
+ if (dest_err->oer_component.oid_object_id !=
+ src_err->oer_component.oid_object_id)
+ dest_err->oer_component.oid_object_id = 0;
+
+ if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
+ dest_err->oer_comp_offset = src_err->oer_comp_offset;
+
+ dest_end = end_offset(dest_err->oer_comp_offset,
+ dest_err->oer_comp_length);
+ src_end = end_offset(src_err->oer_comp_offset,
+ src_err->oer_comp_length);
+ if (dest_end < src_end)
+ dest_end = src_end;
+
+ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+
+ if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
+ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
+ dest_err->oer_errno = src_err->oer_errno;
+ } else if (src_err->oer_iswrite) {
+ dest_err->oer_iswrite = true;
+ dest_err->oer_errno = src_err->oer_errno;
+ }
+}
+
+static void
+encode_accumulated_error(struct objlayout *objlay, __be32 *p)
+{
+ struct objlayout_io_state *state, *tmp;
+ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
+
+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ unsigned i;
+
+ for (i = 0; i < state->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+ if (!ioerr->oer_errno)
+ continue;
+
+ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
+ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "offset=0x%llx length=0x%llx\n",
+ __func__, i, ioerr->oer_errno,
+ ioerr->oer_iswrite,
+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
+ ioerr->oer_component.oid_partition_id,
+ ioerr->oer_component.oid_object_id,
+ ioerr->oer_comp_offset,
+ ioerr->oer_comp_length);
+
+ merge_ioerr(&accumulated_err, ioerr);
+ }
+ list_del(&state->err_list);
+ objlayout_free_io_state(state);
+ }
+
+ pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
+}
+
+void
+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
+ struct objlayout_io_state *state, *tmp;
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+ start = xdr_reserve_space(xdr, 4);
+ BUG_ON(!start);
+
+ spin_lock(&objlay->lock);
+
+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+ __be32 *last_xdr = NULL, *p;
+ unsigned i;
+ int res = 0;
+
+ for (i = 0; i < state->num_comps; i++) {
+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+ if (!ioerr->oer_errno)
+ continue;
+
+ dprintk("%s: err[%d]: errno=%d is_write=%d "
+ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "offset=0x%llx length=0x%llx\n",
+ __func__, i, ioerr->oer_errno,
+ ioerr->oer_iswrite,
+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
+ ioerr->oer_component.oid_partition_id,
+ ioerr->oer_component.oid_object_id,
+ ioerr->oer_comp_offset,
+ ioerr->oer_comp_length);
+
+ p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
+ if (unlikely(!p)) {
+ res = -E2BIG;
+ break; /* accumulated_error */
+ }
+
+ last_xdr = p;
+ pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
+ }
+
+ /* TODO: use xdr_write_pages */
+ if (unlikely(res)) {
+ /* no space for even one error descriptor */
+ BUG_ON(!last_xdr);
+
+ /* we've encountered a situation with lots and lots of
+ * errors and no space to encode them all. Use the last
+ * available slot to report the union of all the
+ * remaining errors.
+ */
+ encode_accumulated_error(objlay, last_xdr);
+ goto loop_done;
+ }
+ list_del(&state->err_list);
+ objlayout_free_io_state(state);
+ }
+loop_done:
+ spin_unlock(&objlay->lock);
+
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+ dprintk("%s: Return\n", __func__);
+}
+
+
+/*
+ * Get Device Info API for io engines
+ */
+struct objlayout_deviceinfo {
+ struct page *page;
+ struct pnfs_osd_deviceaddr da; /* This must be last */
+};
+
+/* Initialize and call nfs_getdeviceinfo, then decode and return a
+ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
+ * should be called.
+ */
+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+ gfp_t gfp_flags)
+{
+ struct objlayout_deviceinfo *odi;
+ struct pnfs_device pd;
+ struct super_block *sb;
+ struct page *page, **pages;
+ u32 *p;
+ int err;
+
+ page = alloc_page(gfp_flags);
+ if (!page)
+ return -ENOMEM;
+
+ pages = &page;
+ pd.pages = pages;
+
+ memcpy(&pd.dev_id, d_id, sizeof(*d_id));
+ pd.layout_type = LAYOUT_OSD2_OBJECTS;
+ pd.pages = &page;
+ pd.pgbase = 0;
+ pd.pglen = PAGE_SIZE;
+ pd.mincount = 0;
+
+ sb = pnfslay->plh_inode->i_sb;
+ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
+ if (err)
+ goto err_out;
+
+ p = page_address(page);
+ odi = kzalloc(sizeof(*odi), gfp_flags);
+ if (!odi) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
+ odi->page = page;
+ *deviceaddr = &odi->da;
+ return 0;
+
+err_out:
+ __free_page(page);
+ return err;
+}
+
+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+ struct objlayout_deviceinfo *odi = container_of(deviceaddr,
+ struct objlayout_deviceinfo,
+ da);
+
+ __free_page(odi->page);
+ kfree(odi);
+}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 000000000000..a8244c8e042d
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,187 @@
+/*
+ * Data types and function declerations for interfacing with the
+ * pNFS standard object layout driver.
+ *
+ * Copyright (C) 2007 Panasas Inc. [year of first publication]
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OBJLAYOUT_H
+#define _OBJLAYOUT_H
+
+#include <linux/nfs_fs.h>
+#include <linux/pnfs_osd_xdr.h>
+#include "../pnfs.h"
+
+/*
+ * per-inode layout
+ */
+struct objlayout {
+ struct pnfs_layout_hdr pnfs_layout;
+
+ /* for layout_commit */
+ enum osd_delta_space_valid_enum {
+ OBJ_DSU_INIT = 0,
+ OBJ_DSU_VALID,
+ OBJ_DSU_INVALID,
+ } delta_space_valid;
+ s64 delta_space_used; /* consumed by write ops */
+
+ /* for layout_return */
+ spinlock_t lock;
+ struct list_head err_list;
+};
+
+static inline struct objlayout *
+OBJLAYOUT(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct objlayout, pnfs_layout);
+}
+
+/*
+ * per-I/O operation state
+ * embedded in objects provider io_state data structure
+ */
+struct objlayout_io_state {
+ struct pnfs_layout_segment *lseg;
+
+ struct page **pages;
+ unsigned pgbase;
+ unsigned nr_pages;
+ unsigned long count;
+ loff_t offset;
+ bool sync;
+
+ void *rpcdata;
+ int status; /* res */
+ int eof; /* res */
+ int committed; /* res */
+
+ /* Error reporting (layout_return) */
+ struct list_head err_list;
+ unsigned num_comps;
+ /* Pointer to array of error descriptors of size num_comps.
+ * It should contain as many entries as devices in the osd_layout
+ * that participate in the I/O. It is up to the io_engine to allocate
+ * needed space and set num_comps.
+ */
+ struct pnfs_osd_ioerr *ioerrs;
+};
+
+/*
+ * Raid engine I/O API
+ */
+extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+ struct pnfs_layout_hdr *pnfslay,
+ struct pnfs_layout_range *range,
+ struct xdr_stream *xdr,
+ gfp_t gfp_flags);
+extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
+
+extern int objio_alloc_io_state(
+ struct pnfs_layout_segment *lseg,
+ struct objlayout_io_state **outp,
+ gfp_t gfp_flags);
+extern void objio_free_io_state(struct objlayout_io_state *state);
+
+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+ bool stable);
+
+/*
+ * callback API
+ */
+extern void objlayout_io_set_result(struct objlayout_io_state *state,
+ unsigned index, struct pnfs_osd_objid *pooid,
+ int osd_error, u64 offset, u64 length, bool is_write);
+
+static inline void
+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
+ struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+
+ /* If one of the I/Os errored out and the delta_space_used was
+ * invalid we render the complete report as invalid. Protocol mandate
+ * the DSU be accurate or not reported.
+ */
+ spin_lock(&objlay->lock);
+ if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
+ objlay->delta_space_valid = OBJ_DSU_VALID;
+ objlay->delta_space_used += space_used;
+ }
+ spin_unlock(&objlay->lock);
+}
+
+extern void objlayout_read_done(struct objlayout_io_state *state,
+ ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_state *state,
+ ssize_t status, bool sync);
+
+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+ gfp_t gfp_flags);
+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
+
+/*
+ * exported generic objects function vectors
+ */
+
+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
+
+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
+ struct pnfs_layout_hdr *,
+ struct nfs4_layoutget_res *,
+ gfp_t gfp_flags);
+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
+
+extern enum pnfs_try_status objlayout_read_pagelist(
+ struct nfs_read_data *);
+
+extern enum pnfs_try_status objlayout_write_pagelist(
+ struct nfs_write_data *,
+ int how);
+
+extern void objlayout_encode_layoutcommit(
+ struct pnfs_layout_hdr *,
+ struct xdr_stream *,
+ const struct nfs4_layoutcommit_args *);
+
+extern void objlayout_encode_layoutreturn(
+ struct pnfs_layout_hdr *,
+ struct xdr_stream *,
+ const struct nfs4_layoutreturn_args *);
+
+#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 000000000000..16fc758e9123
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,412 @@
+/*
+ * Object-Based pNFS Layout XDR layer
+ *
+ * Copyright (C) 2007 Panasas Inc. [year of first publication]
+ * All rights reserved.
+ *
+ * Benny Halevy <bhalevy@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/pnfs_osd_xdr.h>
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * The following implementation is based on RFC5664
+ */
+
+/*
+ * struct pnfs_osd_objid {
+ * struct nfs4_deviceid oid_device_id;
+ * u64 oid_partition_id;
+ * u64 oid_object_id;
+ * }; // xdr size 32 bytes
+ */
+static __be32 *
+_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+ p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
+ sizeof(objid->oid_device_id.data));
+
+ p = xdr_decode_hyper(p, &objid->oid_partition_id);
+ p = xdr_decode_hyper(p, &objid->oid_object_id);
+ return p;
+}
+/*
+ * struct pnfs_osd_opaque_cred {
+ * u32 cred_len;
+ * void *cred;
+ * }; // xdr size [variable]
+ * The return pointers are from the xdr buffer
+ */
+static int
+_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
+ struct xdr_stream *xdr)
+{
+ __be32 *p = xdr_inline_decode(xdr, 1);
+
+ if (!p)
+ return -EINVAL;
+
+ opaque_cred->cred_len = be32_to_cpu(*p++);
+
+ p = xdr_inline_decode(xdr, opaque_cred->cred_len);
+ if (!p)
+ return -EINVAL;
+
+ opaque_cred->cred = p;
+ return 0;
+}
+
+/*
+ * struct pnfs_osd_object_cred {
+ * struct pnfs_osd_objid oc_object_id;
+ * u32 oc_osd_version;
+ * u32 oc_cap_key_sec;
+ * struct pnfs_osd_opaque_cred oc_cap_key
+ * struct pnfs_osd_opaque_cred oc_cap;
+ * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
+ */
+static int
+_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
+ struct xdr_stream *xdr)
+{
+ __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
+ int ret;
+
+ if (!p)
+ return -EIO;
+
+ p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+ comp->oc_osd_version = be32_to_cpup(p++);
+ comp->oc_cap_key_sec = be32_to_cpup(p);
+
+ ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
+ if (unlikely(ret))
+ return ret;
+
+ ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
+ return ret;
+}
+
+/*
+ * struct pnfs_osd_data_map {
+ * u32 odm_num_comps;
+ * u64 odm_stripe_unit;
+ * u32 odm_group_width;
+ * u32 odm_group_depth;
+ * u32 odm_mirror_cnt;
+ * u32 odm_raid_algorithm;
+ * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
+ */
+static inline int
+_osd_data_map_xdr_sz(void)
+{
+ return 4 + 8 + 4 + 4 + 4 + 4;
+}
+
+static __be32 *
+_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
+{
+ data_map->odm_num_comps = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
+ data_map->odm_group_width = be32_to_cpup(p++);
+ data_map->odm_group_depth = be32_to_cpup(p++);
+ data_map->odm_mirror_cnt = be32_to_cpup(p++);
+ data_map->odm_raid_algorithm = be32_to_cpup(p++);
+ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
+ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
+ __func__,
+ data_map->odm_num_comps,
+ (unsigned long long)data_map->odm_stripe_unit,
+ data_map->odm_group_width,
+ data_map->odm_group_depth,
+ data_map->odm_mirror_cnt,
+ data_map->odm_raid_algorithm);
+ return p;
+}
+
+int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
+ struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ memset(iter, 0, sizeof(*iter));
+
+ p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
+ if (unlikely(!p))
+ return -EINVAL;
+
+ p = _osd_xdr_decode_data_map(p, &layout->olo_map);
+ layout->olo_comps_index = be32_to_cpup(p++);
+ layout->olo_num_comps = be32_to_cpup(p++);
+ iter->total_comps = layout->olo_num_comps;
+ return 0;
+}
+
+bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
+ struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
+ int *err)
+{
+ BUG_ON(iter->decoded_comps > iter->total_comps);
+ if (iter->decoded_comps == iter->total_comps)
+ return false;
+
+ *err = _osd_xdr_decode_object_cred(comp, xdr);
+ if (unlikely(*err)) {
+ dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
+ "total_comps=%d\n", __func__, *err,
+ iter->decoded_comps, iter->total_comps);
+ return false; /* stop the loop */
+ }
+ dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
+ "key_len=%u cap_len=%u\n",
+ __func__,
+ _DEVID_LO(&comp->oc_object_id.oid_device_id),
+ _DEVID_HI(&comp->oc_object_id.oid_device_id),
+ comp->oc_object_id.oid_partition_id,
+ comp->oc_object_id.oid_object_id,
+ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
+
+ iter->decoded_comps++;
+ return true;
+}
+
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, all
+ * variable strings fields are left inside the rpc buffer and are only
+ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ * should not be freed while the returned information is in use.
+ */
+/*
+ *struct nfs4_string {
+ * unsigned int len;
+ * char *data;
+ *}; // size [variable]
+ * NOTE: Returned string points to inside the XDR buffer
+ */
+static __be32 *
+__read_u8_opaque(__be32 *p, struct nfs4_string *str)
+{
+ str->len = be32_to_cpup(p++);
+ str->data = (char *)p;
+
+ p += XDR_QUADLEN(str->len);
+ return p;
+}
+
+/*
+ * struct pnfs_osd_targetid {
+ * u32 oti_type;
+ * struct nfs4_string oti_scsi_device_id;
+ * };// size 4 + [variable]
+ */
+static __be32 *
+__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
+{
+ u32 oti_type;
+
+ oti_type = be32_to_cpup(p++);
+ targetid->oti_type = oti_type;
+
+ switch (oti_type) {
+ case OBJ_TARGET_SCSI_NAME:
+ case OBJ_TARGET_SCSI_DEVICE_ID:
+ p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
+ }
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_net_addr {
+ * struct nfs4_string r_netid;
+ * struct nfs4_string r_addr;
+ * };
+ */
+static __be32 *
+__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
+{
+ p = __read_u8_opaque(p, &netaddr->r_netid);
+ p = __read_u8_opaque(p, &netaddr->r_addr);
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_targetaddr {
+ * u32 ota_available;
+ * struct pnfs_osd_net_addr ota_netaddr;
+ * };
+ */
+static __be32 *
+__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
+{
+ u32 ota_available;
+
+ ota_available = be32_to_cpup(p++);
+ targetaddr->ota_available = ota_available;
+
+ if (ota_available)
+ p = __read_net_addr(p, &targetaddr->ota_netaddr);
+
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_deviceaddr {
+ * struct pnfs_osd_targetid oda_targetid;
+ * struct pnfs_osd_targetaddr oda_targetaddr;
+ * u8 oda_lun[8];
+ * struct nfs4_string oda_systemid;
+ * struct pnfs_osd_object_cred oda_root_obj_cred;
+ * struct nfs4_string oda_osdname;
+ * };
+ */
+
+/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
+ * not have an xdr_stream
+ */
+static __be32 *
+__read_opaque_cred(__be32 *p,
+ struct pnfs_osd_opaque_cred *opaque_cred)
+{
+ opaque_cred->cred_len = be32_to_cpu(*p++);
+ opaque_cred->cred = p;
+ return p + XDR_QUADLEN(opaque_cred->cred_len);
+}
+
+static __be32 *
+__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
+{
+ p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+ comp->oc_osd_version = be32_to_cpup(p++);
+ comp->oc_cap_key_sec = be32_to_cpup(p++);
+
+ p = __read_opaque_cred(p, &comp->oc_cap_key);
+ p = __read_opaque_cred(p, &comp->oc_cap);
+ return p;
+}
+
+void pnfs_osd_xdr_decode_deviceaddr(
+ struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
+{
+ p = __read_targetid(p, &deviceaddr->oda_targetid);
+
+ p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
+
+ p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
+ sizeof(deviceaddr->oda_lun));
+
+ p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
+
+ p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
+
+ p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
+
+ /* libosd likes this terminated in dbg. It's last, so no problems */
+ deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
+}
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ * u32 dsu_valid;
+ * s64 dsu_delta;
+ * u32 olu_ioerr_flag;
+ * }; xdr size 4 + 8 + 4
+ */
+int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+ struct pnfs_osd_layoutupdate *lou)
+{
+ __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4);
+
+ if (!p)
+ return -E2BIG;
+
+ *p++ = cpu_to_be32(lou->dsu_valid);
+ if (lou->dsu_valid)
+ p = xdr_encode_hyper(p, lou->dsu_delta);
+ *p++ = cpu_to_be32(lou->olu_ioerr_flag);
+ return 0;
+}
+
+/*
+ * struct pnfs_osd_objid {
+ * struct nfs4_deviceid oid_device_id;
+ * u64 oid_partition_id;
+ * u64 oid_object_id;
+ * }; // xdr size 32 bytes
+ */
+static inline __be32 *
+pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
+{
+ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
+ sizeof(object_id->oid_device_id.data));
+ p = xdr_encode_hyper(p, object_id->oid_partition_id);
+ p = xdr_encode_hyper(p, object_id->oid_object_id);
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ * struct pnfs_osd_objid oer_component;
+ * u64 oer_comp_offset;
+ * u64 oer_comp_length;
+ * u32 oer_iswrite;
+ * u32 oer_errno;
+ * }; // xdr size 32 + 24 bytes
+ */
+void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
+{
+ p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
+ p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
+ p = xdr_encode_hyper(p, ioerr->oer_comp_length);
+ *p++ = cpu_to_be32(ioerr->oer_iswrite);
+ *p = cpu_to_be32(ioerr->oer_errno);
+}
+
+__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 32 + 24);
+ if (unlikely(!p))
+ dprintk("%s: out of xdr space\n", __func__);
+
+ return p;
+}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c80add6e2213..7913961aff22 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
TASK_UNINTERRUPTIBLE);
}
+static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+{
+ /*
+ * FIXME: ideally we should be able to coalesce all requests
+ * that are not block boundary aligned, but currently this
+ * is problematic for the case of bsize < PAGE_CACHE_SIZE,
+ * since nfs_flush_multi and nfs_pagein_multi assume you
+ * can have only one struct nfs_page.
+ */
+ if (desc->pg_bsize < PAGE_SIZE)
+ return 0;
+
+ return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
+}
+
/**
* nfs_pageio_init - initialise a page io descriptor
* @desc: pointer to descriptor
@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_ioflags = io_flags;
desc->pg_error = 0;
desc->pg_lseg = NULL;
+ desc->pg_test = nfs_generic_pg_test;
+ pnfs_pageio_init(desc, inode);
}
/**
@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
*
* Return 'true' if this is the case, else return 'false'.
*/
-static int nfs_can_coalesce_requests(struct nfs_page *prev,
- struct nfs_page *req,
- struct nfs_pageio_descriptor *pgio)
+static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+ struct nfs_page *req,
+ struct nfs_pageio_descriptor *pgio)
{
if (req->wb_context->cred != prev->wb_context->cred)
- return 0;
+ return false;
if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
- return 0;
+ return false;
if (req->wb_context->state != prev->wb_context->state)
- return 0;
+ return false;
if (req->wb_index != (prev->wb_index + 1))
- return 0;
+ return false;
if (req->wb_pgbase != 0)
- return 0;
+ return false;
if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
- return 0;
- /*
- * Non-whole file layouts need to check that req is inside of
- * pgio->pg_lseg.
- */
- if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
- return 0;
- return 1;
+ return false;
+ return pgio->pg_test(pgio, prev, req);
}
/**
@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
- size_t newlen = req->wb_bytes;
-
if (desc->pg_count != 0) {
struct nfs_page *prev;
- /*
- * FIXME: ideally we should be able to coalesce all requests
- * that are not block boundary aligned, but currently this
- * is problematic for the case of bsize < PAGE_CACHE_SIZE,
- * since nfs_flush_multi and nfs_pagein_multi assume you
- * can have only one struct nfs_page.
- */
- if (desc->pg_bsize < PAGE_SIZE)
- return 0;
- newlen += desc->pg_count;
- if (newlen > desc->pg_bsize)
- return 0;
prev = nfs_list_entry(desc->pg_list.prev);
if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
- } else
+ } else {
desc->pg_base = req->wb_pgbase;
+ }
nfs_list_remove_request(req);
nfs_list_add_request(req, &desc->pg_list);
- desc->pg_count = newlen;
+ desc->pg_count += req->wb_bytes;
return 1;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f57f5281a520..8c1309d852a6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
atomic_inc(&lo->plh_refcount);
}
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
+ kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+}
+
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+ return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
+}
+
static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
dprintk("%s: freeing layout cache %p\n", __func__, lo);
BUG_ON(!list_empty(&lo->plh_layouts));
NFS_I(lo->plh_inode)->layout = NULL;
- kfree(lo);
+ pnfs_free_layout_hdr(lo);
}
static void
@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
{
struct inode *inode = lseg->pls_layout->plh_inode;
- BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
list_del_init(&lseg->pls_list);
if (list_empty(&lseg->pls_layout->plh_segs)) {
set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(put_lseg);
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
+
+/*
+ * is l2 fully contained in l1?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline int
+lo_seg_contained(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
+{
+ u64 start1 = l1->offset;
+ u64 end1 = end_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 end2 = end_offset(start2, l2->length);
+
+ return (start1 <= start2) && (end1 >= end2);
+}
+
+/*
+ * is l1 and l2 intersecting?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline int
+lo_seg_intersecting(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
+{
+ u64 start1 = l1->offset;
+ u64 end1 = end_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 end2 = end_offset(start2, l2->length);
+
+ return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
+ (end2 == NFS4_MAX_UINT64 || end2 > start1);
+}
+
static bool
-should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
+should_free_lseg(struct pnfs_layout_range *lseg_range,
+ struct pnfs_layout_range *recall_range)
{
- return (recall_iomode == IOMODE_ANY ||
- lseg_iomode == recall_iomode);
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode) &&
+ lo_seg_intersecting(lseg_range, recall_range);
}
/* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- u32 iomode)
+ struct pnfs_layout_range *recall_range)
{
struct pnfs_layout_segment *lseg, *next;
int invalid = 0, removed = 0;
@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return 0;
}
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+ if (!recall_range ||
+ should_free_lseg(&lseg->pls_range, recall_range)) {
dprintk("%s: freeing lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
lo = nfsi->layout;
if (lo) {
lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
+ mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
}
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
- u32 iomode,
+ struct pnfs_layout_range *range,
gfp_t gfp_flags)
{
struct inode *ino = lo->plh_inode;
@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
goto out_err_free;
}
- lgp->args.minlength = NFS4_MAX_UINT64;
+ lgp->args.minlength = PAGE_CACHE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- lgp->args.range.iomode = iomode;
- lgp->args.range.offset = 0;
- lgp->args.range.length = NFS4_MAX_UINT64;
+ lgp->args.range = *range;
lgp->args.type = server->pnfs_curr_ld->id;
lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx);
@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
nfs4_proc_layoutget(lgp);
if (!lseg) {
/* remember that LAYOUTGET failed and suspend trying */
- set_bit(lo_fail_bit(iomode), &lo->plh_flags);
+ set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
}
/* free xdr pages */
@@ -542,6 +619,51 @@ out_err_free:
return NULL;
}
+/* Initiates a LAYOUTRETURN(FILE) */
+int
+_pnfs_return_layout(struct inode *ino)
+{
+ struct pnfs_layout_hdr *lo = NULL;
+ struct nfs_inode *nfsi = NFS_I(ino);
+ LIST_HEAD(tmp_list);
+ struct nfs4_layoutreturn *lrp;
+ nfs4_stateid stateid;
+ int status = 0;
+
+ dprintk("--> %s\n", __func__);
+
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
+ spin_unlock(&ino->i_lock);
+ dprintk("%s: no layout segments to return\n", __func__);
+ goto out;
+ }
+ stateid = nfsi->layout->plh_stateid;
+ /* Reference matched in nfs4_layoutreturn_release */
+ get_layout_hdr(lo);
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&tmp_list);
+
+ WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
+
+ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+ if (unlikely(lrp == NULL)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ lrp->args.stateid = stateid;
+ lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
+ lrp->args.inode = ino;
+ lrp->clp = NFS_SERVER(ino)->nfs_client;
+
+ status = nfs4_proc_layoutreturn(lrp);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
bool pnfs_roc(struct inode *ino)
{
struct pnfs_layout_hdr *lo;
@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
* are seen first.
*/
static s64
-cmp_layout(u32 iomode1, u32 iomode2)
+cmp_layout(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
{
+ s64 d;
+
+ /* high offset > low offset */
+ d = l1->offset - l2->offset;
+ if (d)
+ return d;
+
+ /* short length > long length */
+ d = l2->length - l1->length;
+ if (d)
+ return d;
+
/* read > read/write */
- return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+ return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
static void
@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg)
{
struct pnfs_layout_segment *lp;
- int found = 0;
dprintk("%s:Begin\n", __func__);
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
+ if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
lseg->pls_range.offset, lseg->pls_range.length,
lp, lp->pls_range.iomode, lp->pls_range.offset,
lp->pls_range.length);
- found = 1;
- break;
- }
- if (!found) {
- list_add_tail(&lseg->pls_list, &lo->plh_segs);
- dprintk("%s: inserted lseg %p "
- "iomode %d offset %llu length %llu at tail\n",
- __func__, lseg, lseg->pls_range.iomode,
- lseg->pls_range.offset, lseg->pls_range.length);
+ goto out;
}
+ list_add_tail(&lseg->pls_list, &lo->plh_segs);
+ dprintk("%s: inserted lseg %p "
+ "iomode %d offset %llu length %llu at tail\n",
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length);
+out:
get_layout_hdr(lo);
dprintk("%s:Return\n", __func__);
@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
struct pnfs_layout_hdr *lo;
- lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+ lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
if (!lo)
return NULL;
atomic_set(&lo->plh_refcount, 1);
@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
if (likely(nfsi->layout == NULL)) /* Won the race? */
nfsi->layout = new;
else
- kfree(new);
+ pnfs_free_layout_hdr(new);
return nfsi->layout;
}
@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
* READ RW true
*/
static int
-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+is_matching_lseg(struct pnfs_layout_range *ls_range,
+ struct pnfs_layout_range *range)
{
- return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
+ struct pnfs_layout_range range1;
+
+ if ((range->iomode == IOMODE_RW &&
+ ls_range->iomode != IOMODE_RW) ||
+ !lo_seg_intersecting(ls_range, range))
+ return 0;
+
+ /* range1 covers only the first byte in the range */
+ range1 = *range;
+ range1.length = 1;
+ return lo_seg_contained(ls_range, &range1);
}
/*
* lookup range in layout
*/
static struct pnfs_layout_segment *
-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
{
struct pnfs_layout_segment *lseg, *ret = NULL;
@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
- is_matching_lseg(lseg, iomode)) {
+ is_matching_lseg(&lseg->pls_range, range)) {
ret = get_lseg(lseg);
break;
}
- if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
+ if (cmp_layout(range, &lseg->pls_range) > 0)
break;
}
@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
enum pnfs_iomode iomode,
gfp_t gfp_flags)
{
+ struct pnfs_layout_range arg = {
+ .iomode = iomode,
+ .offset = pos,
+ .length = count,
+ };
+ unsigned pg_offset;
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;
/* Check to see if the layout for the given range already exists */
- lseg = pnfs_find_lseg(lo, iomode);
+ lseg = pnfs_find_lseg(lo, &arg);
if (lseg)
goto out_unlock;
@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino,
spin_unlock(&clp->cl_lock);
}
- lseg = send_layoutget(lo, ctx, iomode, gfp_flags);
+ pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+ if (pg_offset) {
+ arg.offset -= pg_offset;
+ arg.length += pg_offset;
+ }
+ arg.length = PAGE_CACHE_ALIGN(arg.length);
+
+ lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
if (!lseg && first) {
spin_lock(&clp->cl_lock);
list_del_init(&lo->plh_layouts);
@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0;
- /* Verify we got what we asked for.
- * Note that because the xdr parsing only accepts a single
- * element array, this can fail even if the server is behaving
- * correctly.
- */
- if (lgp->args.range.iomode > res->range.iomode ||
- res->range.offset != 0 ||
- res->range.length != NFS4_MAX_UINT64) {
- status = -EINVAL;
- goto out;
- }
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
if (!lseg || IS_ERR(lseg)) {
@@ -895,51 +1043,64 @@ out_forget_reply:
goto out;
}
-static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *prev,
- struct nfs_page *req)
+bool
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
{
+ enum pnfs_iomode access_type;
+ gfp_t gfp_flags;
+
+ /* We assume that pg_ioflags == 0 iff we're reading a page */
+ if (pgio->pg_ioflags == 0) {
+ access_type = IOMODE_READ;
+ gfp_flags = GFP_KERNEL;
+ } else {
+ access_type = IOMODE_RW;
+ gfp_flags = GFP_NOFS;
+ }
+
if (pgio->pg_count == prev->wb_bytes) {
/* This is first coelesce call for a series of nfs_pages */
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
prev->wb_context,
- IOMODE_READ,
- GFP_KERNEL);
+ req_offset(req),
+ pgio->pg_count,
+ access_type,
+ gfp_flags);
+ return true;
}
- return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
-void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
-{
- struct pnfs_layoutdriver_type *ld;
+ if (pgio->pg_lseg &&
+ req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length))
+ return false;
- ld = NFS_SERVER(inode)->pnfs_curr_ld;
- pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
+ return true;
}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
-static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *prev,
- struct nfs_page *req)
+/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_write_done(struct nfs_write_data *data)
{
- if (pgio->pg_count == prev->wb_bytes) {
- /* This is first coelesce call for a series of nfs_pages */
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- prev->wb_context,
- IOMODE_RW,
- GFP_NOFS);
- }
- return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
+ int status;
-void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
-{
- struct pnfs_layoutdriver_type *ld;
+ if (!data->pnfs_error) {
+ pnfs_set_layoutcommit(data);
+ data->mds_ops->rpc_call_done(&data->task, data);
+ data->mds_ops->rpc_release(data);
+ return 0;
+ }
- ld = NFS_SERVER(inode)->pnfs_curr_ld;
- pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
+ dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+ data->pnfs_error);
+ status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
+ data->mds_ops, NFS_FILE_SYNC);
+ return status ? : -EAGAIN;
}
+EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
enum pnfs_try_status
pnfs_try_to_write_data(struct nfs_write_data *wdata,
@@ -966,6 +1127,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
}
/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_read_done(struct nfs_read_data *data)
+{
+ int status;
+
+ if (!data->pnfs_error) {
+ __nfs4_read_done_cb(data);
+ data->mds_ops->rpc_call_done(&data->task, data);
+ data->mds_ops->rpc_release(data);
+ return 0;
+ }
+
+ dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+ data->pnfs_error);
+ status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
+ data->mds_ops);
+ return status ? : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
+
+/*
* Call the appropriate parallel I/O subsystem read function.
*/
enum pnfs_try_status
@@ -1009,7 +1193,7 @@ void
pnfs_set_layoutcommit(struct nfs_write_data *wdata)
{
struct nfs_inode *nfsi = NFS_I(wdata->inode);
- loff_t end_pos = wdata->args.offset + wdata->res.count;
+ loff_t end_pos = wdata->mds_offset + wdata->res.count;
bool mark_as_dirty = false;
spin_lock(&nfsi->vfs_inode.i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0c015bad9e7a..48d0a8e4d062 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,7 @@
#ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H
+#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
enum {
@@ -64,17 +65,29 @@ enum {
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
+enum layoutdriver_policy_flags {
+ /* Should the pNFS client commit and return the layout upon a setattr */
+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
+};
+
+struct nfs4_deviceid_node;
+
/* Per-layout driver specific registration structure */
struct pnfs_layoutdriver_type {
struct list_head pnfs_tblid;
const u32 id;
const char *name;
struct module *owner;
+ unsigned flags;
+
+ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
+ void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
/* test for nfs page cache coalescing */
- int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+ bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
/* Returns true if layoutdriver wants to divert this request to
* driver's commit routine.
@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type {
*/
enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+
+ void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+
+ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args);
+
+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *args);
};
struct pnfs_layout_hdr {
@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev);
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
/* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo);
void put_lseg(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
- enum pnfs_iomode access_type, gfp_t gfp_flags);
+ loff_t pos, u64 count, enum pnfs_iomode access_type,
+ gfp_t gfp_flags);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
const struct rpc_call_ops *, int);
enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
const struct rpc_call_ops *);
-void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
-void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
+bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct nfs4_state *open_state);
int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- u32 iomode);
+ struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int _pnfs_return_layout(struct inode *);
+int pnfs_ld_write_done(struct nfs_write_data *);
+int pnfs_ld_read_done(struct nfs_read_data *);
+
+/* pnfs_dev.c */
+struct nfs4_deviceid_node {
+ struct hlist_node node;
+ const struct pnfs_layoutdriver_type *ld;
+ const struct nfs_client *nfs_client;
+ struct nfs4_deviceid deviceid;
+ atomic_t ref;
+};
+
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
+ const struct pnfs_layoutdriver_type *,
+ const struct nfs_client *,
+ const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
+bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_deviceid_purge_client(const struct nfs_client *);
static inline int lo_fail_bit(u32 iomode)
{
@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
put_lseg(req->wb_commit_lseg);
}
+/* Should the pNFS client commit and return the layout upon a setattr */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return false;
+ return NFS_SERVER(inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_SETATTR;
+}
+
+static inline int pnfs_return_layout(struct inode *ino)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_server *nfss = NFS_SERVER(ino);
+
+ if (pnfs_enabled_sb(nfss) && nfsi->layout)
+ return _pnfs_return_layout(ino);
+
+ return 0;
+}
+
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld)
+ pgio->pg_test = ld->pg_test;
+}
+
#else /* CONFIG_NFS_V4_1 */
static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
static inline struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
- enum pnfs_iomode access_type, gfp_t gfp_flags)
+ loff_t pos, u64 count, enum pnfs_iomode access_type,
+ gfp_t gfp_flags)
{
return NULL;
}
@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
return PNFS_NOT_ATTEMPTED;
}
+static inline int pnfs_return_layout(struct inode *ino)
+{
+ return 0;
+}
+
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ return false;
+}
+
static inline bool
pnfs_roc(struct inode *ino)
{
@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
{
}
-static inline void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
-{
- pgio->pg_test = NULL;
-}
-
-static inline void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode)
{
- pgio->pg_test = NULL;
}
static inline void
@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
return 0;
}
+
+static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 000000000000..c65e133ce9c0
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,270 @@
+/*
+ * Device operations for the pnfs client.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ * Garth Goodson <Garth.Goodson@netapp.com>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+
+/*
+ * Device ID RCU cache. A device ID is unique per server and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS 5
+#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+
+void
+nfs4_print_deviceid(const struct nfs4_deviceid *id)
+{
+ u32 *p = (u32 *)id;
+
+ dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+ p[0], p[1], p[2], p[3]);
+}
+EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+
+static inline u32
+nfs4_deviceid_hash(const struct nfs4_deviceid *id)
+{
+ unsigned char *cptr = (unsigned char *)id->data;
+ unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+ u32 x = 0;
+
+ while (nbytes--) {
+ x *= 37;
+ x += *cptr++;
+ }
+ return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+static struct nfs4_deviceid_node *
+_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id,
+ long hash)
+{
+ struct nfs4_deviceid_node *d;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+ if (d->ld == ld && d->nfs_client == clp &&
+ !memcmp(&d->deviceid, id, sizeof(*id))) {
+ if (atomic_read(&d->ref))
+ return d;
+ else
+ continue;
+ }
+ return NULL;
+}
+
+/*
+ * Lookup a deviceid in cache and get a reference count on it if found
+ *
+ * @clp nfs_client associated with deviceid
+ * @id deviceid to look up
+ */
+struct nfs4_deviceid_node *
+_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id,
+ long hash)
+{
+ struct nfs4_deviceid_node *d;
+
+ rcu_read_lock();
+ d = _lookup_deviceid(ld, clp, id, hash);
+ if (d && !atomic_inc_not_zero(&d->ref))
+ d = NULL;
+ rcu_read_unlock();
+ return d;
+}
+
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+ return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+
+/*
+ * Unhash and put deviceid
+ *
+ * @clp nfs_client associated with deviceid
+ * @id the deviceid to unhash
+ *
+ * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
+ */
+struct nfs4_deviceid_node *
+nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+ struct nfs4_deviceid_node *d;
+
+ spin_lock(&nfs4_deviceid_lock);
+ rcu_read_lock();
+ d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+ rcu_read_unlock();
+ if (!d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ return NULL;
+ }
+ hlist_del_init_rcu(&d->node);
+ spin_unlock(&nfs4_deviceid_lock);
+ synchronize_rcu();
+
+ /* balance the initial ref set in pnfs_insert_deviceid */
+ if (atomic_dec_and_test(&d->ref))
+ return d;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
+
+/*
+ * Delete a deviceid from cache
+ *
+ * @clp struct nfs_client qualifying the deviceid
+ * @id deviceid to delete
+ */
+void
+nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+ struct nfs4_deviceid_node *d;
+
+ d = nfs4_unhash_put_deviceid(ld, clp, id);
+ if (!d)
+ return;
+ d->ld->free_deviceid_node(d);
+}
+EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
+ const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *nfs_client,
+ const struct nfs4_deviceid *id)
+{
+ INIT_HLIST_NODE(&d->node);
+ d->ld = ld;
+ d->nfs_client = nfs_client;
+ d->deviceid = *id;
+ atomic_set(&d->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
+
+/*
+ * Uniquely initialize and insert a deviceid node into cache
+ *
+ * @new new deviceid node
+ * Note that the caller must set up the following members:
+ * new->ld
+ * new->nfs_client
+ * new->deviceid
+ *
+ * @ret the inserted node, if none found, otherwise, the found entry.
+ */
+struct nfs4_deviceid_node *
+nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
+{
+ struct nfs4_deviceid_node *d;
+ long hash;
+
+ spin_lock(&nfs4_deviceid_lock);
+ hash = nfs4_deviceid_hash(&new->deviceid);
+ d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
+ if (d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ return d;
+ }
+
+ hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+ spin_unlock(&nfs4_deviceid_lock);
+
+ return new;
+}
+EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
+
+/*
+ * Dereference a deviceid node and delete it when its reference count drops
+ * to zero.
+ *
+ * @d deviceid node to put
+ *
+ * @ret true iff the node was deleted
+ */
+bool
+nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock))
+ return false;
+ hlist_del_init_rcu(&d->node);
+ spin_unlock(&nfs4_deviceid_lock);
+ synchronize_rcu();
+ d->ld->free_deviceid_node(d);
+ return true;
+}
+EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
+
+static void
+_deviceid_purge_client(const struct nfs_client *clp, long hash)
+{
+ struct nfs4_deviceid_node *d;
+ struct hlist_node *n, *next;
+ HLIST_HEAD(tmp);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+ if (d->nfs_client == clp && atomic_read(&d->ref)) {
+ hlist_del_init_rcu(&d->node);
+ hlist_add_head(&d->node, &tmp);
+ }
+ rcu_read_unlock();
+
+ if (hlist_empty(&tmp))
+ return;
+
+ synchronize_rcu();
+ hlist_for_each_entry_safe(d, n, next, &tmp, node)
+ if (atomic_dec_and_test(&d->ref))
+ d->ld->free_deviceid_node(d);
+}
+
+void
+nfs4_deviceid_purge_client(const struct nfs_client *clp)
+{
+ long h;
+
+ spin_lock(&nfs4_deviceid_lock);
+ for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
+ _deviceid_purge_client(clp, h);
+ spin_unlock(&nfs4_deviceid_lock);
+}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2bcf0dc306a1..20a7f952e244 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
atomic_set(&req->wb_complete, requests);
BUG_ON(desc->pg_lseg != NULL);
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_READ, GFP_KERNEL);
ClearPageError(page);
offset = 0;
nbytes = desc->pg_count;
@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
}
req = nfs_list_entry(data->pages.next);
if ((!lseg) && list_is_singular(&data->pages))
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_READ, GFP_KERNEL);
ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
0, lseg);
@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
goto read_complete; /* all pages were read */
- pnfs_pageio_init_read(&pgio, inode);
if (rsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e288f06d3fa7..ce40e5c568ba 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -63,6 +63,7 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
return 0;
}
+#ifdef CONFIG_NFS_V4_1
+void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+ if (nfs4_has_session(server->nfs_client))
+ seq_printf(m, ",sessions");
+}
+#else
+void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+ seq_printf(m, ",pnfs=");
+ if (server->pnfs_curr_ld)
+ seq_printf(m, "%s", server->pnfs_curr_ld->name);
+ else
+ seq_printf(m, "not configured");
+}
+#else /* CONFIG_NFS_V4_1 */
+void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif /* CONFIG_NFS_V4_1 */
static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
{
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+ show_sessions(m, nfss);
+ show_pnfs(m, nfss);
}
#endif
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 49c715b4ac92..e268e3b23497 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
atomic_set(&req->wb_complete, requests);
BUG_ON(desc->pg_lseg);
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_RW, GFP_NOFS);
ClearPageError(page);
offset = 0;
nbytes = desc->pg_count;
@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
}
req = nfs_list_entry(data->pages.next);
if ((!lseg) && list_is_singular(&data->pages))
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_RW, GFP_NOFS);
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
{
size_t wsize = NFS_SERVER(inode)->wsize;
- pnfs_pageio_init_write(pgio, inode);
-
if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
else
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index ad000aeb21a2..b9566e46219f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1354,12 +1354,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
- if (rv)
- goto out;
- rv = check_nfsd_access(exp, rqstp);
- if (rv)
- fh_put(fhp);
-out:
exp_put(exp);
return rv;
}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 2247fc91d5e9..9095f3c21df9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -245,7 +245,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
}
/* Now create the file and set attributes */
- nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len,
+ nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
attr, newfhp,
argp->createmode, argp->verf, NULL, NULL);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index ad48faca20fc..08c6e36ab2eb 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -842,7 +842,7 @@ out:
return rv;
}
-__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
{
struct svc_fh fh;
int err;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5fcb1396a7e3..3a6dbd70b34b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -196,9 +196,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
/*
* Note: create modes (UNCHECKED,GUARDED...) are the same
- * in NFSv4 as in v3.
+ * in NFSv4 as in v3 except EXCLUSIVE4_1.
*/
- status = nfsd_create_v3(rqstp, current_fh, open->op_fname.data,
+ status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
open->op_fname.len, &open->op_iattr,
&resfh, open->op_createmode,
(u32 *)open->op_verf.data,
@@ -403,7 +403,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
putfh->pf_fhlen);
- return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+ return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
}
static __be32
@@ -762,6 +762,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 err;
fh_init(&resfh, NFS4_FHSIZE);
+ err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
+ if (err)
+ return err;
err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
secinfo->si_name, secinfo->si_namelen,
&exp, &dentry);
@@ -986,6 +989,9 @@ enum nfsd4_op_flags {
ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */
+ /* For rfc 5661 section 2.6.3.1.1: */
+ OP_HANDLES_WRONGSEC = 1 << 3,
+ OP_IS_PUTFH_LIKE = 1 << 4,
};
struct nfsd4_operation {
@@ -1031,6 +1037,44 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
return nfs_ok;
}
+static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
+{
+ return &nfsd4_ops[op->opnum];
+}
+
+static bool need_wrongsec_check(struct svc_rqst *rqstp)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+ struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
+ struct nfsd4_op *next = &argp->ops[resp->opcnt];
+ struct nfsd4_operation *thisd;
+ struct nfsd4_operation *nextd;
+
+ thisd = OPDESC(this);
+ /*
+ * Most ops check wronsec on our own; only the putfh-like ops
+ * have special rules.
+ */
+ if (!(thisd->op_flags & OP_IS_PUTFH_LIKE))
+ return false;
+ /*
+ * rfc 5661 2.6.3.1.1.6: don't bother erroring out a
+ * put-filehandle operation if we're not going to use the
+ * result:
+ */
+ if (argp->opcnt == resp->opcnt)
+ return false;
+
+ nextd = OPDESC(next);
+ /*
+ * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
+ * errors themselves as necessary; others should check for them
+ * now:
+ */
+ return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
+}
+
/*
* COMPOUND call.
*/
@@ -1108,7 +1152,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
goto encode_op;
}
- opdesc = &nfsd4_ops[op->opnum];
+ opdesc = OPDESC(op);
if (!cstate->current_fh.fh_dentry) {
if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
@@ -1126,6 +1170,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
else
BUG_ON(op->status == nfs_ok);
+ if (!op->status && need_wrongsec_check(rqstp))
+ op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
+
encode_op:
/* Only from SEQUENCE */
if (resp->cstate.status == nfserr_replay_cache) {
@@ -1217,10 +1264,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOOKUP] = {
.op_func = (nfsd4op_func)nfsd4_lookup,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_LOOKUP",
},
[OP_LOOKUPP] = {
.op_func = (nfsd4op_func)nfsd4_lookupp,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_LOOKUPP",
},
[OP_NVERIFY] = {
@@ -1229,6 +1278,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_OPEN] = {
.op_func = (nfsd4op_func)nfsd4_open,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_OPEN",
},
[OP_OPEN_CONFIRM] = {
@@ -1241,17 +1291,20 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE,
.op_name = "OP_PUTFH",
},
[OP_PUTPUBFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE,
.op_name = "OP_PUTPUBFH",
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE,
.op_name = "OP_PUTROOTFH",
},
[OP_READ] = {
@@ -1281,15 +1334,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_RESTOREFH] = {
.op_func = (nfsd4op_func)nfsd4_restorefh,
- .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE,
.op_name = "OP_RESTOREFH",
},
[OP_SAVEFH] = {
.op_func = (nfsd4op_func)nfsd4_savefh,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SAVEFH",
},
[OP_SECINFO] = {
.op_func = (nfsd4op_func)nfsd4_secinfo,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SECINFO",
},
[OP_SETATTR] = {
@@ -1353,6 +1409,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
[OP_SECINFO_NO_NAME] = {
.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+ .op_flags = OP_HANDLES_WRONGSEC,
.op_name = "OP_SECINFO_NO_NAME",
},
};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4cf04e11c66c..e98f3c2e9492 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1519,6 +1519,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
bool confirm_me = false;
int status = 0;
+ if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
+ return nfserr_inval;
+
nfs4_lock_state();
unconf = find_unconfirmed_client(&cr_ses->clientid);
conf = find_confirmed_client(&cr_ses->clientid);
@@ -1637,8 +1640,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
return nfserr_badsession;
status = nfsd4_map_bcts_dir(&bcts->dir);
- nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
- return nfs_ok;
+ if (!status)
+ nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
+ return status;
}
static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
@@ -1725,6 +1729,13 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
return;
}
+static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
+{
+ struct nfsd4_compoundargs *args = rqstp->rq_argp;
+
+ return args->opcnt > session->se_fchannel.maxops;
+}
+
__be32
nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -1753,6 +1764,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (!session)
goto out;
+ status = nfserr_too_many_ops;
+ if (nfsd4_session_too_many_ops(rqstp, session))
+ goto out;
+
status = nfserr_badslot;
if (seq->slotid >= session->se_fchannel.maxreqs)
goto out;
@@ -1808,6 +1823,8 @@ out:
__be32
nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
{
+ int status = 0;
+
if (rc->rca_one_fs) {
if (!cstate->current_fh.fh_dentry)
return nfserr_nofilehandle;
@@ -1817,9 +1834,14 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
*/
return nfs_ok;
}
+
nfs4_lock_state();
- if (is_client_expired(cstate->session->se_client)) {
- nfs4_unlock_state();
+ status = nfserr_complete_already;
+ if (cstate->session->se_client->cl_firststate)
+ goto out;
+
+ status = nfserr_stale_clientid;
+ if (is_client_expired(cstate->session->se_client))
/*
* The following error isn't really legal.
* But we only get here if the client just explicitly
@@ -1827,11 +1849,13 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
* error it gets back on an operation for the dead
* client.
*/
- return nfserr_stale_clientid;
- }
+ goto out;
+
+ status = nfs_ok;
nfsd4_create_clid_dir(cstate->session->se_client);
+out:
nfs4_unlock_state();
- return nfs_ok;
+ return status;
}
__be32
@@ -2462,7 +2486,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
return NULL;
}
-int share_access_to_flags(u32 share_access)
+static int share_access_to_flags(u32 share_access)
{
share_access &= ~NFS4_SHARE_WANT_MASK;
@@ -2882,7 +2906,7 @@ out:
return status;
}
-struct lock_manager nfsd4_manager = {
+static struct lock_manager nfsd4_manager = {
};
static void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c6766af00d98..990181103214 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -424,15 +424,12 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
{
DECODE_HEAD;
- u32 dummy;
READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
READ32(bcts->dir);
- /* XXX: Perhaps Tom Tucker could help us figure out how we
- * should be using ctsa_use_conn_in_rdma_mode: */
- READ32(dummy);
-
+ /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
+ * could help us figure out we should be using it. */
DECODE_TAIL;
}
@@ -588,8 +585,6 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
READ_BUF(lockt->lt_owner.len);
READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
- if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
- return nfserr_inval;
DECODE_TAIL;
}
@@ -3120,7 +3115,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
return nfserr;
}
-__be32
+static __be32
nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
struct nfsd4_sequence *seq)
{
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 55c8e63af0be..90c6aa6d5e0f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -344,7 +344,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
* which clients virtually always use auth_sys for,
* even while using RPCSEC_GSS for NFS.
*/
- if (access & NFSD_MAY_LOCK)
+ if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS)
goto skip_pseudoflavor_check;
/*
* Clients may expect to be able to use auth_sys during mount,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 129f3c9f62d5..d5718273bb32 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -181,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct svc_export *exp;
struct dentry *dparent;
struct dentry *dentry;
- __be32 err;
int host_err;
dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
- /* Obtain dentry and export. */
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
- if (err)
- return err;
-
dparent = fhp->fh_dentry;
exp = fhp->fh_export;
exp_get(exp);
@@ -254,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
struct dentry *dentry;
__be32 err;
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (err)
+ return err;
err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
if (err)
return err;
@@ -877,13 +874,11 @@ static __be32
nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
{
- struct inode *inode;
mm_segment_t oldfs;
__be32 err;
int host_err;
err = nfserr_perm;
- inode = file->f_path.dentry->d_inode;
if (file->f_op->splice_read && rqstp->rq_splice_ok) {
struct splice_desc sd = {
@@ -1340,11 +1335,18 @@ out_nfserr:
}
#ifdef CONFIG_NFSD_V3
+
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+ return createmode == NFS3_CREATE_EXCLUSIVE
+ || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
/*
- * NFSv3 version of nfsd_create
+ * NFSv3 and NFSv4 version of nfsd_create
*/
__be32
-nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
+do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap,
struct svc_fh *resfhp, int createmode, u32 *verifier,
int *truncp, int *created)
@@ -1396,7 +1398,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err)
goto out;
- if (createmode == NFS3_CREATE_EXCLUSIVE) {
+ if (nfsd_create_is_exclusive(createmode)) {
/* solaris7 gets confused (bugid 4218508) if these have
* the high bit set, so just clear the high bits. If this is
* ever changed to use different attrs for storing the
@@ -1437,6 +1439,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
&& dchild->d_inode->i_atime.tv_sec == v_atime
&& dchild->d_inode->i_size == 0 )
break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
+ && dchild->d_inode->i_atime.tv_sec == v_atime
+ && dchild->d_inode->i_size == 0 )
+ goto set_attr;
/* fallthru */
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
@@ -1455,7 +1462,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_check_ignore_resizing(iap);
- if (createmode == NFS3_CREATE_EXCLUSIVE) {
+ if (nfsd_create_is_exclusive(createmode)) {
/* Cram the verifier into atime/mtime */
iap->ia_valid = ATTR_MTIME|ATTR_ATIME
| ATTR_MTIME_SET|ATTR_ATIME_SET;
@@ -2034,7 +2041,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
struct inode *inode = dentry->d_inode;
int err;
- if (acc == NFSD_MAY_NOP)
+ if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
return 0;
#if 0
dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 9a370a5e36b7..e0bbac04d1dd 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -17,10 +17,14 @@
#define NFSD_MAY_SATTR 8
#define NFSD_MAY_TRUNC 16
#define NFSD_MAY_LOCK 32
+#define NFSD_MAY_MASK 63
+
+/* extra hints to permission and open routines: */
#define NFSD_MAY_OWNER_OVERRIDE 64
#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
#define NFSD_MAY_NOT_BREAK_LEASE 512
+#define NFSD_MAY_BYPASS_GSS 1024
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -54,7 +58,7 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
int type, dev_t rdev, struct svc_fh *res);
#ifdef CONFIG_NFSD_V3
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
-__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
+__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
struct svc_fh *res, int createmode,
u32 *verifier, int *truncp, int *created);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 587f18432832..b954878ad6ce 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -917,7 +917,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
* construction. This function can be called both as a single operation
* and as a part of indivisible file operations.
*/
-void nilfs_dirty_inode(struct inode *inode)
+void nilfs_dirty_inode(struct inode *inode, int flags)
{
struct nilfs_transaction_info ti;
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1102a5fbb744..546849b3e88f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
struct nilfs_transaction_info ti;
int err;
- dentry_unhash(dentry);
-
err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
if (err)
return err;
@@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct nilfs_transaction_info ti;
int err;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
if (unlikely(err))
return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a9c6a531f80c..f02b9ad43a21 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -269,7 +269,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
extern int nilfs_mark_inode_dirty(struct inode *);
-extern void nilfs_dirty_inode(struct inode *);
+extern void nilfs_dirty_inode(struct inode *, int flags);
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c368360c35a1..3b8d3979e03b 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -241,11 +241,9 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
int ret;
- if (S_ISDIR(inode->i_mode)) {
- dentry_unhash(dentry);
- if (!omfs_dir_is_empty(inode))
- return -ENOTEMPTY;
- }
+ if (S_ISDIR(inode->i_mode) &&
+ !omfs_dir_is_empty(inode))
+ return -ENOTEMPTY;
ret = omfs_delete_entry(dentry);
if (ret)
@@ -382,9 +380,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int err;
if (new_inode) {
- if (S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
/* overwriting existing file/dir */
err = omfs_remove(new_dir, new_dentry);
if (err)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f82e762eeca2..d545e97d99c3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,13 +255,7 @@ ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
- struct gendisk *disk = dev_to_disk(dev);
- unsigned int alignment = 0;
-
- if (disk->queue)
- alignment = queue_limit_discard_alignment(&disk->queue->limits,
- p->start_sect);
- return sprintf(buf, "%u\n", alignment);
+ return sprintf(buf, "%u\n", p->discard_alignment);
}
ssize_t part_stat_show(struct device *dev,
@@ -455,6 +449,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
p->start_sect = start;
p->alignment_offset =
queue_limit_alignment_offset(&disk->queue->limits, start);
+ p->discard_alignment =
+ queue_limit_discard_alignment(&disk->queue->limits, start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4ede550517a6..14def991d9dd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,6 +83,9 @@
#include <linux/pid_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
+#ifdef CONFIG_HARDWALL
+#include <asm/hardwall.h>
+#endif
#include "internal.h"
/* NOTE:
@@ -2842,6 +2845,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
INF("io", S_IRUGO, proc_tgid_io_accounting),
#endif
+#ifdef CONFIG_HARDWALL
+ INF("hardwall", S_IRUGO, proc_pid_hardwall),
+#endif
};
static int proc_tgid_base_readdir(struct file * filp,
@@ -3181,6 +3187,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
INF("io", S_IRUGO, proc_tid_io_accounting),
#endif
+#ifdef CONFIG_HARDWALL
+ INF("hardwall", S_IRUGO, proc_pid_hardwall),
+#endif
};
static int proc_tid_base_readdir(struct file * filp,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 76c8164d5651..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
- dentry_unhash(dentry);
-
/* we will be doing 2 balancings and update 2 stat data, we change quotas
* of the owner of the directory and of the owner of the parent directory.
* The quota structure is possibly deleted only on last iput => outside
@@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned long savelink = 1;
struct timespec ctime;
- if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
- dentry_unhash(new_dentry);
-
/* three balancings: (1) old name removal, (2) new name insertion
and (3) maybe "save" link insertion
stat data updates: (1) old directory,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b216ff6be1c9..aa91089162cb 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -568,7 +568,7 @@ static void destroy_inodecache(void)
}
/* we don't mark inodes dirty, we just log them */
-static void reiserfs_dirty_inode(struct inode *inode)
+static void reiserfs_dirty_inode(struct inode *inode, int flags)
{
struct reiserfs_transaction_handle th;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 50f1abccd1cd..e8a62f41b458 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -98,7 +98,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
I_MUTEX_CHILD, dir->i_sb);
- dentry_unhash(dentry);
error = dir->i_op->rmdir(dir, dentry);
if (!error)
dentry->d_inode->i_flags |= S_DEAD;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 730c56248c9b..5e1101ff276f 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -147,7 +147,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
* table[0] points to the first inode lookup table metadata block,
* this should be less than lookup_table_start
*/
- if (!IS_ERR(table) && table[0] >= lookup_table_start) {
+ if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 1516a6490bfb..0ed6edbc5c71 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -90,7 +90,7 @@ __le64 *squashfs_read_fragment_index_table(struct super_block *sb,
* table[0] points to the first fragment table metadata block, this
* should be less than fragment_table_start
*/
- if (!IS_ERR(table) && table[0] >= fragment_table_start) {
+ if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index a70858e0fb44..d38ea3dab951 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -93,7 +93,7 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
* table[0] points to the first id lookup table metadata block, this
* should be less than id_table_start
*/
- if (!IS_ERR(table) && table[0] >= id_table_start) {
+ if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6f26abee3597..7438850c62d0 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -245,7 +245,7 @@ allocate_id_index_table:
msblk->id_table = NULL;
goto failed_mount;
}
- next_table = msblk->id_table[0];
+ next_table = le64_to_cpu(msblk->id_table[0]);
/* Handle inode lookup table */
lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
@@ -261,7 +261,7 @@ allocate_id_index_table:
msblk->inode_lookup_table = NULL;
goto failed_mount;
}
- next_table = msblk->inode_lookup_table[0];
+ next_table = le64_to_cpu(msblk->inode_lookup_table[0]);
sb->s_export_op = &squashfs_export_ops;
@@ -286,7 +286,7 @@ handle_fragments:
msblk->fragment_index = NULL;
goto failed_mount;
}
- next_table = msblk->fragment_index[0];
+ next_table = le64_to_cpu(msblk->fragment_index[0]);
check_directory_table:
/* Sanity check directory_table */
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e2cc6756f3b1..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
struct inode *inode = dentry->d_inode;
int err = -ENOTEMPTY;
- dentry_unhash(dentry);
-
if (sysv_empty_dir(inode)) {
err = sysv_unlink(dir, dentry);
if (!err) {
@@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
struct sysv_dir_entry * old_de;
int err = -ENOENT;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_de = sysv_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index c2b80943560d..ef5abd38f0bf 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
- dentry_unhash(dentry);
-
/*
* Budget request settings: deletion direntry, deletion inode and
* changing the parent inode. If budgeting fails, go ahead anyway
@@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec time;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
/*
* Budget request settings: deletion direntry, new direntry, removing
* the old inode, and changing old and new parent directory inodes.
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 166951e0dcd3..3be645e012c9 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -581,6 +581,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
ubifs_assert(wbuf->size % c->min_io_size == 0);
ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
ubifs_assert(!c->ro_media && !c->ro_mount);
+ ubifs_assert(!c->space_fixup);
if (c->leb_size - wbuf->offs >= c->max_write_size)
ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
@@ -759,6 +760,7 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
ubifs_assert(!c->ro_media && !c->ro_mount);
+ ubifs_assert(!c->space_fixup);
if (c->ro_error)
return -EROFS;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 34b1679e6e3a..cef0460f4c54 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -669,6 +669,7 @@ out_free:
out_release:
release_head(c, BASEHD);
+ kfree(dent);
out_ro:
ubifs_ro_mode(c, err);
if (last_reference)
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index bd644bf587a8..a5422fffbd69 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -674,7 +674,7 @@ static int kill_orphans(struct ubifs_info *c)
if (IS_ERR(sleb)) {
if (PTR_ERR(sleb) == -EUCLEAN)
sleb = ubifs_recover_leb(c, lnum, 0,
- c->sbuf, 0);
+ c->sbuf, -1);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 731d9e2e7b50..783d8e0beb76 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
}
/**
- * drop_last_node - drop the last node or group of nodes.
+ * drop_last_group - drop the last group of nodes.
* @sleb: scanned LEB information
* @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
*
* This is a helper function for 'ubifs_recover_leb()' which drops the last
- * node of the scanned LEB or the last group of nodes if @grouped is not zero.
- * This function returns %1 if a node was dropped and %0 otherwise.
+ * group of nodes of the scanned LEB.
*/
-static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
+static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
{
- int dropped = 0;
-
while (!list_empty(&sleb->nodes)) {
struct ubifs_scan_node *snod;
struct ubifs_ch *ch;
@@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
list);
ch = snod->node;
if (ch->group_type != UBIFS_IN_NODE_GROUP)
- return dropped;
- dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
+ break;
+
+ dbg_rcvry("dropping grouped node at %d:%d",
+ sleb->lnum, snod->offs);
+ *offs = snod->offs;
+ list_del(&snod->list);
+ kfree(snod);
+ sleb->nodes_cnt -= 1;
+ }
+}
+
+/**
+ * drop_last_node - drop the last node.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB.
+ */
+static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
+{
+ struct ubifs_scan_node *snod;
+
+ if (!list_empty(&sleb->nodes)) {
+ snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+ list);
+
+ dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
*offs = snod->offs;
list_del(&snod->list);
kfree(snod);
sleb->nodes_cnt -= 1;
- dropped = 1;
- if (!grouped)
- break;
}
- return dropped;
}
/**
@@ -604,7 +623,8 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
* @lnum: LEB number
* @offs: offset
* @sbuf: LEB-sized buffer to use
- * @grouped: nodes may be grouped for recovery
+ * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not
+ * belong to any journal head)
*
* This function does a scan of a LEB, but caters for errors that might have
* been caused by the unclean unmount from which we are attempting to recover.
@@ -612,13 +632,14 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
* found, and a negative error code in case of failure.
*/
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
- int offs, void *sbuf, int grouped)
+ int offs, void *sbuf, int jhead)
{
int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
+ int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped;
struct ubifs_scan_leb *sleb;
void *buf = sbuf + offs;
- dbg_rcvry("%d:%d", lnum, offs);
+ dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped);
sleb = ubifs_start_scan(c, lnum, offs, sbuf);
if (IS_ERR(sleb))
@@ -635,7 +656,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
* Scan quietly until there is an error from which we cannot
* recover
*/
- ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
if (ret == SCANNED_A_NODE) {
/* A valid node, and not a padding node */
struct ubifs_ch *ch = buf;
@@ -695,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
* If nodes are grouped, always drop the incomplete group at
* the end.
*/
- drop_last_node(sleb, &offs, 1);
+ drop_last_group(sleb, &offs);
- /*
- * While we are in the middle of the same min. I/O unit keep dropping
- * nodes. So basically, what we want is to make sure that the last min.
- * I/O unit where we saw the corruption is dropped completely with all
- * the uncorrupted node which may possibly sit there.
- *
- * In other words, let's name the min. I/O unit where the corruption
- * starts B, and the previous min. I/O unit A. The below code tries to
- * deal with a situation when half of B contains valid nodes or the end
- * of a valid node, and the second half of B contains corrupted data or
- * garbage. This means that UBIFS had been writing to B just before the
- * power cut happened. I do not know how realistic is this scenario
- * that half of the min. I/O unit had been written successfully and the
- * other half not, but this is possible in our 'failure mode emulation'
- * infrastructure at least.
- *
- * So what is the problem, why we need to drop those nodes? Whey can't
- * we just clean-up the second half of B by putting a padding node
- * there? We can, and this works fine with one exception which was
- * reproduced with power cut emulation testing and happens extremely
- * rarely. The description follows, but it is worth noting that that is
- * only about the GC head, so we could do this trick only if the bud
- * belongs to the GC head, but it does not seem to be worth an
- * additional "if" statement.
- *
- * So, imagine the file-system is full, we run GC which is moving valid
- * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
- * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
- * and will try to continue. Imagine that LEB X is currently the
- * dirtiest LEB, and the amount of used space in LEB Y is exactly the
- * same as amount of free space in LEB X.
- *
- * And a power cut happens when nodes are moved from LEB X to LEB Y. We
- * are here trying to recover LEB Y which is the GC head LEB. We find
- * the min. I/O unit B as described above. Then we clean-up LEB Y by
- * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
- * fails, because it cannot find a dirty LEB which could be GC'd into
- * LEB Y! Even LEB X does not match because the amount of valid nodes
- * there does not fit the free space in LEB Y any more! And this is
- * because of the padding node which we added to LEB Y. The
- * user-visible effect of this which I once observed and analysed is
- * that we cannot mount the file-system with -ENOSPC error.
- *
- * So obviously, to make sure that situation does not happen we should
- * free min. I/O unit B in LEB Y completely and the last used min. I/O
- * unit in LEB Y should be A. This is basically what the below code
- * tries to do.
- */
- while (min_io_unit == round_down(offs, c->min_io_size) &&
- min_io_unit != offs &&
- drop_last_node(sleb, &offs, grouped));
+ if (jhead == GCHD) {
+ /*
+ * If this LEB belongs to the GC head then while we are in the
+ * middle of the same min. I/O unit keep dropping nodes. So
+ * basically, what we want is to make sure that the last min.
+ * I/O unit where we saw the corruption is dropped completely
+ * with all the uncorrupted nodes which may possibly sit there.
+ *
+ * In other words, let's name the min. I/O unit where the
+ * corruption starts B, and the previous min. I/O unit A. The
+ * below code tries to deal with a situation when half of B
+ * contains valid nodes or the end of a valid node, and the
+ * second half of B contains corrupted data or garbage. This
+ * means that UBIFS had been writing to B just before the power
+ * cut happened. I do not know how realistic is this scenario
+ * that half of the min. I/O unit had been written successfully
+ * and the other half not, but this is possible in our 'failure
+ * mode emulation' infrastructure at least.
+ *
+ * So what is the problem, why we need to drop those nodes? Why
+ * can't we just clean-up the second half of B by putting a
+ * padding node there? We can, and this works fine with one
+ * exception which was reproduced with power cut emulation
+ * testing and happens extremely rarely.
+ *
+ * Imagine the file-system is full, we run GC which starts
+ * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
+ * the current GC head LEB). The @c->gc_lnum is -1, which means
+ * that GC will retain LEB X and will try to continue. Imagine
+ * that LEB X is currently the dirtiest LEB, and the amount of
+ * used space in LEB Y is exactly the same as amount of free
+ * space in LEB X.
+ *
+ * And a power cut happens when nodes are moved from LEB X to
+ * LEB Y. We are here trying to recover LEB Y which is the GC
+ * head LEB. We find the min. I/O unit B as described above.
+ * Then we clean-up LEB Y by padding min. I/O unit. And later
+ * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
+ * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
+ * does not match because the amount of valid nodes there does
+ * not fit the free space in LEB Y any more! And this is
+ * because of the padding node which we added to LEB Y. The
+ * user-visible effect of this which I once observed and
+ * analysed is that we cannot mount the file-system with
+ * -ENOSPC error.
+ *
+ * So obviously, to make sure that situation does not happen we
+ * should free min. I/O unit B in LEB Y completely and the last
+ * used min. I/O unit in LEB Y should be A. This is basically
+ * what the below code tries to do.
+ */
+ while (offs > min_io_unit)
+ drop_last_node(sleb, &offs);
+ }
buf = sbuf + offs;
len = c->leb_size - offs;
@@ -881,7 +905,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
}
ubifs_scan_destroy(sleb);
}
- return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
+ return ubifs_recover_leb(c, lnum, offs, sbuf, -1);
}
/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 6617280d1679..5e97161ce4d3 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -557,8 +557,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
* these LEBs could possibly be written to at the power cut
* time.
*/
- sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
- b->bud->jhead != GCHD);
+ sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead);
else
sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
if (IS_ERR(sleb))
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 46961c003236..9e1d05666fed 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,13 +277,18 @@ static int kick_a_thread(void)
return 0;
}
-int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
{
+ int nr = sc->nr_to_scan;
int freed, contention = 0;
long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
if (nr == 0)
- return clean_zn_cnt;
+ /*
+ * Due to the way UBIFS updates the clean znode counter it may
+ * temporarily be negative.
+ */
+ return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
if (!clean_zn_cnt) {
/*
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6db0bdaa9f74..b5aeb5a8ebed 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -382,7 +382,7 @@ done:
end_writeback(inode);
}
-static void ubifs_dirty_inode(struct inode *inode)
+static void ubifs_dirty_inode(struct inode *inode, int flags)
{
struct ubifs_inode *ui = ubifs_inode(inode);
@@ -811,15 +811,18 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
c->jheads[i].wbuf.jhead = i;
+ c->jheads[i].grouped = 1;
}
c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
/*
* Garbage Collector head likely contains long-term data and
- * does not need to be synchronized by timer.
+ * does not need to be synchronized by timer. Also GC head nodes are
+ * not grouped.
*/
c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
c->jheads[GCHD].wbuf.no_timer = 1;
+ c->jheads[GCHD].grouped = 0;
return 0;
}
@@ -1284,12 +1287,25 @@ static int mount_ubifs(struct ubifs_info *c)
if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
ubifs_msg("recovery needed");
c->need_recovery = 1;
- if (!c->ro_mount) {
- err = ubifs_recover_inl_heads(c, c->sbuf);
- if (err)
- goto out_master;
- }
- } else if (!c->ro_mount) {
+ }
+
+ if (c->need_recovery && !c->ro_mount) {
+ err = ubifs_recover_inl_heads(c, c->sbuf);
+ if (err)
+ goto out_master;
+ }
+
+ err = ubifs_lpt_init(c, 1, !c->ro_mount);
+ if (err)
+ goto out_master;
+
+ if (!c->ro_mount && c->space_fixup) {
+ err = ubifs_fixup_free_space(c);
+ if (err)
+ goto out_master;
+ }
+
+ if (!c->ro_mount) {
/*
* Set the "dirty" flag so that if we reboot uncleanly we
* will notice this immediately on the next mount.
@@ -1297,13 +1313,9 @@ static int mount_ubifs(struct ubifs_info *c)
c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
err = ubifs_write_master(c);
if (err)
- goto out_master;
+ goto out_lpt;
}
- err = ubifs_lpt_init(c, 1, !c->ro_mount);
- if (err)
- goto out_lpt;
-
err = dbg_check_idx_size(c, c->bi.old_idx_sz);
if (err)
goto out_lpt;
@@ -1396,12 +1408,6 @@ static int mount_ubifs(struct ubifs_info *c)
} else
ubifs_assert(c->lst.taken_empty_lebs > 0);
- if (!c->ro_mount && c->space_fixup) {
- err = ubifs_fixup_free_space(c);
- if (err)
- goto out_infos;
- }
-
err = dbg_check_filesystem(c);
if (err)
goto out_infos;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8119b1fd8d94..91b4213dde84 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2876,12 +2876,13 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
*/
void ubifs_tnc_close(struct ubifs_info *c)
{
- long clean_freed;
-
tnc_destroy_cnext(c);
if (c->zroot.znode) {
- clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
- atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt);
+ long n;
+
+ ubifs_destroy_tnc_subtree(c->zroot.znode);
+ n = atomic_long_read(&c->clean_zn_cnt);
+ atomic_long_sub(n, &ubifs_clean_zn_cnt);
}
kfree(c->gap_lebs);
kfree(c->ilebs);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 93d1412a06f0..f79983d6f860 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -722,12 +722,14 @@ struct ubifs_bud {
* struct ubifs_jhead - journal head.
* @wbuf: head's write-buffer
* @buds_list: list of bud LEBs belonging to this journal head
+ * @grouped: non-zero if UBIFS groups nodes when writing to this journal head
*
* Note, the @buds list is protected by the @c->buds_lock.
*/
struct ubifs_jhead {
struct ubifs_wbuf wbuf;
struct list_head buds_list;
+ unsigned int grouped:1;
};
/**
@@ -1614,7 +1616,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
int ubifs_tnc_end_commit(struct ubifs_info *c);
/* shrinker.c */
-int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
/* commit.c */
int ubifs_bg_thread(void *info);
@@ -1742,7 +1744,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
int ubifs_recover_master_node(struct ubifs_info *c);
int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
- int offs, void *sbuf, int grouped);
+ int offs, void *sbuf, int jhead);
struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
int offs, void *sbuf);
int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 4d76594c2a8f..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc *fi, cfi;
struct kernel_lb_addr tloc;
- dentry_unhash(dentry);
-
retval = -ENOENT;
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
if (!fi)
@@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
if (ofi) {
if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 953ebdfc5bf7..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
struct inode * inode = dentry->d_inode;
int err= -ENOTEMPTY;
- dentry_unhash(dentry);
-
lock_ufs(dir->i_sb);
if (ufs_empty_dir (inode)) {
err = ufs_unlink(dir, dentry);
@@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ufs_dir_entry *old_de;
int err = -ENOENT;
- if (new_inode && S_ISDIR(new_inode->i_mode))
- dentry_unhash(new_dentry);
-
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/xattr.c b/fs/xattr.c
index f1ef94974dea..f060663ab70c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -46,18 +46,22 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return 0;
/*
- * The trusted.* namespace can only be accessed by a privileged user.
+ * The trusted.* namespace can only be accessed by privileged users.
*/
- if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
- return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
+ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+ if (!capable(CAP_SYS_ADMIN))
+ return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
+ return 0;
+ }
- /* In user.* namespace, only regular files and directories can have
+ /*
+ * In the user.* namespace, only regular files and directories can have
* extended attributes. For sticky directories, only the owner and
- * privileged user can write attributes.
+ * privileged users can write attributes.
*/
if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
- return -EPERM;
+ return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
(mask & MAY_WRITE) && !inode_owner_or_capable(inode))
return -EPERM;
@@ -87,7 +91,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
{
struct inode *inode = dentry->d_inode;
int error = -EOPNOTSUPP;
+ int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
+ if (issec)
+ inode->i_flags &= ~S_NOSEC;
if (inode->i_op->setxattr) {
error = inode->i_op->setxattr(dentry, name, value, size, flags);
if (!error) {
@@ -95,8 +103,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
security_inode_post_setxattr(dentry, name, value,
size, flags);
}
- } else if (!strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN)) {
+ } else if (issec) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
error = security_inode_setsecurity(inode, suffix, value,
size, flags);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 98b9c91fcdf1..1e3a7ce804dc 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -925,7 +925,8 @@ xfs_fs_inode_init_once(
*/
STATIC void
xfs_fs_dirty_inode(
- struct inode *inode)
+ struct inode *inode,
+ int flags)
{
barrier();
XFS_I(inode)->i_update_core = 1;