Diffstat (limited to 'fs')
259 files changed, 26685 insertions, 5261 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 5f8ab8adb5f5..ab5547ff29a1 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -37,6 +37,7 @@ #include <linux/mount.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -155,6 +156,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, root = d_alloc_root(inode); if (!root) { + iput(inode); retval = -ENOMEM; goto release_sb; } @@ -173,10 +175,7 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); return 0; release_sb: - if (sb) { - up_write(&sb->s_umount); - deactivate_super(sb); - } + deactivate_locked_super(sb); free_stat: kfree(st); @@ -230,9 +229,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) static void v9fs_umount_begin(struct super_block *sb) { - struct v9fs_session_info *v9ses = sb->s_fs_info; + struct v9fs_session_info *v9ses; + lock_kernel(); + v9ses = sb->s_fs_info; v9fs_session_cancel(v9ses); + unlock_kernel(); } static const struct super_operations v9fs_super_ops = { diff --git a/fs/Kconfig b/fs/Kconfig index 86b203fc3c56..9f7270f36b2a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -175,9 +175,34 @@ source "fs/qnx4/Kconfig" source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" - source "fs/exofs/Kconfig" +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. 
+ endif # MISC_FILESYSTEMS menuconfig NETWORK_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 70b2aed87133..af6d04700d9c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -114,6 +114,7 @@ obj-$(CONFIG_JFS_FS) += jfs/ obj-$(CONFIG_XFS_FS) += xfs/ obj-$(CONFIG_9P_FS) += 9p/ obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ diff --git a/fs/affs/super.c b/fs/affs/super.c index 5ce695e707fe..63f5183f263b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -507,8 +507,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) kfree(new_opts); return -EINVAL; } - kfree(sb->s_options); - sb->s_options = new_opts; + replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; sbi->s_mode = mode; diff --git a/fs/afs/file.c b/fs/afs/file.c index 7a1d942ef68d..0149dab365e7 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -102,6 +102,7 @@ int afs_release(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_AFS_FSCACHE /* * deal with notification that a page was read from the cache */ @@ -117,6 +118,7 @@ static void afs_file_readpage_read_complete(struct page *page, SetPageUptodate(page); unlock_page(page); } +#endif /* * AFS read page from file, directory or symlink diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c index 49f189423063..7ad36506c256 100644 --- a/fs/afs/netdevices.c +++ b/fs/afs/netdevices.c @@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen) struct net_device *dev; int ret = -ENODEV; - if (maclen != ETH_ALEN) - BUG(); + BUG_ON(maclen != ETH_ALEN); rtnl_lock(); dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); diff --git a/fs/afs/super.c b/fs/afs/super.c index aee239a048cb..76828e5f8a39 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -405,21 +405,20 @@ static int afs_get_sb(struct file_system_type *fs_type, sb->s_flags = flags; ret = afs_fill_super(sb, ¶ms); if (ret < 0) { - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); goto error; } - sb->s_options = new_opts; + save_mount_options(sb, new_opts); sb->s_flags |= MS_ACTIVE; } else { _debug("reuse"); - kfree(new_opts); ASSERTCMP(sb->s_flags, &, MS_ACTIVE); } simple_set_mnt(mnt, sb); afs_put_volume(params.volume); afs_put_cell(params.cell); + kfree(new_opts); _leave(" = 0 [%p]", sb); return 0; diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index bf8c8af98004..4eb4d8dfb2f1 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c @@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, { struct autofs_dirhash *dh = &sbi->dirhash; struct autofs_dir_ent *ent; - struct dentry *dentry; unsigned long timeout = sbi->exp_timeout; while (1) { + struct path path; + int umount_ok; + if ( list_empty(&dh->expiry_head) || sbi->catatonic ) return NULL; /* No entries */ /* We keep the list sorted by last_usage and want old stuff */ @@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, return ent; /* Symlinks are always expirable */ /* Get the dentry for the autofs subdirectory */ - dentry = ent->dentry; + path.dentry = ent->dentry; - if ( !dentry ) { + if (!path.dentry) { /* Should only happen in catatonic mode */ printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name); autofs_delete_usage(ent); continue; } - if ( !dentry->d_inode ) { - dput(dentry); + if (!path.dentry->d_inode) { + dput(path.dentry); printk("autofs: negative dentry on expiry queue: %s\n", ent->name); 
autofs_delete_usage(ent); @@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, /* Make sure entry is mounted and unused; note that dentry will point to the mounted-on-top root. */ - if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) { + if (!S_ISDIR(path.dentry->d_inode->i_mode) || + !d_mountpoint(path.dentry)) { DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - mntget(mnt); - dget(dentry); - if (!follow_down(&mnt, &dentry)) { - dput(dentry); - mntput(mnt); + path.mnt = mnt; + path_get(&path); + if (!follow_down(&path.mnt, &path.dentry)) { + path_put(&path); DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(dentry) && follow_down(&mnt, &dentry)) + while (d_mountpoint(path.dentry) && + follow_down(&path.mnt, &path.dentry)) ; - dput(dentry); + umount_ok = may_umount(path.mnt); + path_put(&path); - if ( may_umount(mnt) ) { - mntput(mnt); + if (umount_ok) { DPRINTK(("autofs: signaling expire on %s\n", ent->name)); return ent; /* Expirable! */ } DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name)); - mntput(mnt); } return NULL; /* No expirable entries */ } diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 9e5ae8a4f5c8..84168c0dcc2d 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -54,11 +54,10 @@ static int check_name(const char *name) * Check a string doesn't overrun the chunk of * memory we copied from user land. */ -static int invalid_str(char *str, void *end) +static int invalid_str(char *str, size_t size) { - while ((void *) str <= end) - if (!*str++) - return 0; + if (memchr(str, 0, size)) + return 0; return -EINVAL; } @@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) } if (param->size > sizeof(*param)) { - err = invalid_str(param->path, - (void *) ((size_t) param + param->size)); + err = invalid_str(param->path, param->size - sizeof(*param)); if (err) { AUTOFS_WARN( "path string terminator missing for cmd(0x%08x)", @@ -488,7 +486,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, } path = param->path; - devid = sbi->sb->s_dev; + devid = new_encode_dev(sbi->sb->s_dev); param->requester.uid = param->requester.gid = -1; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 75f7ddacf7d6..3077d8f16523 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -70,8 +70,10 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) * Otherwise it's an offset mount and we need to check * if we can umount its mount, if there is one. 
*/ - if (!d_mountpoint(dentry)) + if (!d_mountpoint(dentry)) { + status = 0; goto done; + } } /* Update the expiry counter if fs is busy */ diff --git a/fs/befs/debug.c b/fs/befs/debug.c index b8e304a0661e..622e73775c83 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -17,6 +17,7 @@ #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/slab.h> #endif /* __KERNEL__ */ diff --git a/fs/befs/super.c b/fs/befs/super.c index 41f2b4d0093e..ca40f828f64d 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <asm/page.h> /* for PAGE_SIZE */ #include "befs.h" #include "super.h" diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 70cfc4b84ae0..fdb66faa24f1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1388,7 +1388,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; prstatus->pr_pid = task_pid_vnr(p); - prstatus->pr_ppid = task_pid_vnr(p->parent); + prstatus->pr_ppid = task_pid_vnr(p->real_parent); prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { @@ -1433,7 +1433,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_psargs[len] = 0; psinfo->pr_pid = task_pid_vnr(p); - psinfo->pr_ppid = task_pid_vnr(p->parent); + psinfo->pr_ppid = task_pid_vnr(p->real_parent); psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); @@ -175,14 +175,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_vec *bvl; /* - * If 'bs' is given, lookup the pool and do the mempool alloc. - * If not, this is a bio_kmalloc() allocation and just do a - * kzalloc() for the exact number of vecs right away. - */ - if (!bs) - bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask); - - /* * see comment near bvec_array define! 
*/ switch (nr) { @@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } -/* - * default destructor for a bio allocated with bio_alloc_bioset() - */ -static void bio_fs_destructor(struct bio *bio) -{ - bio_free(bio, fs_bio_set); -} - -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_has_allocated_vec(bio)) - kfree(bio->bi_io_vec); - kfree(bio); -} - void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); @@ -301,21 +278,15 @@ void bio_init(struct bio *bio) **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; - struct bio *bio = NULL; - unsigned long idx = 0; - void *p = NULL; - - if (bs) { - p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p) - goto err; - bio = p + bs->front_pad; - } else { - bio = kmalloc(sizeof(*bio), gfp_mask); - if (!bio) - goto err; - } + struct bio *bio; + void *p; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (unlikely(!p)) + return NULL; + bio = p + bs->front_pad; bio_init(bio); @@ -332,22 +303,33 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) nr_iovecs = bvec_nr_vecs(idx); } +out_set: bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; -out_set: bio->bi_io_vec = bvl; - return bio; err_free: - if (bs) - mempool_free(p, bs->bio_pool); - else - kfree(bio); -err: + mempool_free(p, bs->bio_pool); return NULL; } +static void bio_fs_destructor(struct bio *bio) +{ + bio_free(bio, fs_bio_set); +} + +/** + * bio_alloc - allocate a new bio, memory pool backed + * @gfp_mask: allocation mask to use + * @nr_iovecs: number of iovecs + * + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask + * contains __GFP_WAIT, the allocation is guaranteed to succeed. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); @@ -358,19 +340,45 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) return bio; } -/* - * Like bio_alloc(), but doesn't use a mempool backing. This means that - * it CAN fail, but while bio_alloc() can only be used for allocations - * that have a short (finite) life span, bio_kmalloc() should be used - * for more permanent bio allocations (like allocating some bio's for - * initalization or setup purposes). - */ +static void bio_kmalloc_destructor(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_free(bio); + kfree(bio); +} + +/** + * bio_alloc - allocate a bio for I/O + * @gfp_mask: the GFP_ mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Description: + * bio_alloc will allocate a bio and associated bio_vec array that can hold + * at least @nr_iovecs entries. Allocations will be done from the + * fs_bio_set. Also see @bio_alloc_bioset. + * + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate + * a bio. This is due to the mempool guarantees. To make this work, callers + * must never allocate more than 1 bio at the time from this pool. Callers + * that need to allocate more than 1 bio must always submit the previously + * allocate bio for IO before attempting to allocate a new one. Failure to + * do so can cause livelocks under memory pressure. 
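[Editor's note: the mempool rule spelled out in the bio_alloc() comment above is easy to get wrong, so here is a minimal userspace sketch of the discipline it describes: with a single reserved object, an allocation is only guaranteed to succeed if the previously allocated bio was submitted first. The pool type and helpers (tiny_pool, pool_get, pool_put) are hypothetical stand-ins for illustration, not kernel APIs.]

	#include <stdio.h>
	#include <stdlib.h>

	struct tiny_pool {
		void *slot;	/* one pre-allocated object, like a mempool reserve */
		int in_use;
	};

	static void *pool_get(struct tiny_pool *p)
	{
		/* the kernel would sleep here waiting for the reserve;
		 * this sketch just fails loudly instead */
		if (p->in_use) {
			fprintf(stderr, "would block: previous object not submitted\n");
			return NULL;
		}
		p->in_use = 1;
		return p->slot;
	}

	static void pool_put(struct tiny_pool *p, void *obj)
	{
		(void)obj;
		p->in_use = 0;	/* "submitting" the I/O releases the reserve */
	}

	int main(void)
	{
		struct tiny_pool pool = { .slot = malloc(64), .in_use = 0 };

		/* correct pattern: get one bio, submit it, only then get the next */
		void *bio1 = pool_get(&pool);
		pool_put(&pool, bio1);		/* submit previous bio for I/O */
		void *bio2 = pool_get(&pool);	/* now guaranteed to succeed */

		printf("second allocation %s\n", bio2 ? "succeeded" : "failed");
		pool_put(&pool, bio2);
		free(pool.slot);
		return 0;
	}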
+ * + **/ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); + struct bio *bio; - if (bio) - bio->bi_destructor = bio_kmalloc_destructor; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + if (unlikely(!bio)) + return NULL; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_destructor = bio_kmalloc_destructor; return bio; } @@ -809,12 +817,15 @@ struct bio *bio_copy_user_iov(struct request_queue *q, len += iov[i].iov_len; } + if (offset) + nr_pages++; + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); ret = -ENOMEM; - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; @@ -938,7 +949,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!nr_pages) return ERR_PTR(-EINVAL); - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); @@ -1122,7 +1133,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data, int offset, i; struct bio *bio; - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9adf5e4f7e96..94212844a9bc 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,25 +1,10 @@ -ifneq ($(KERNELRELEASE),) -# kbuild part of makefile obj-$(CONFIG_BTRFS_FS) := btrfs.o -btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ compression.o delayed-ref.o -else - -# Normal Makefile - -KERNELDIR := /lib/modules/`uname -r`/build -all: - $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -clean: - $(MAKE) -C $(KERNELDIR) M=`pwd` clean - -endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7fdd184a528d..cbba000dccbe 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return ERR_PTR(-EINVAL); } + /* Handle the cached NULL acl case without locking */ + acl = ACCESS_ONCE(*p_acl); + if (!acl) + return acl; + spin_lock(&inode->i_lock); - if (*p_acl != BTRFS_ACL_NOT_CACHED) - acl = posix_acl_dup(*p_acl); + acl = *p_acl; + if (acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); - if (acl) + if (acl != BTRFS_ACL_NOT_CACHED) return acl; - size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) btrfs_update_cached_acl(inode, p_acl, acl); } kfree(value); - } else if (size == -ENOENT) { + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? 
I think nobody */ acl = NULL; btrfs_update_cached_acl(inode, p_acl, acl); + } else { + acl = ERR_PTR(-EIO); } return acl; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 51bfdfc8fcda..502c3d61de62 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -25,6 +25,7 @@ #define WORK_QUEUED_BIT 0 #define WORK_DONE_BIT 1 #define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 /* * container for the kthread task pointer and the list of pending work @@ -36,6 +37,7 @@ struct btrfs_worker_thread { /* list of struct btrfs_work that are waiting for service */ struct list_head pending; + struct list_head prio_pending; /* list of worker threads from struct btrfs_workers */ struct list_head worker_list; @@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, spin_lock_irqsave(&workers->lock, flags); - while (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } if (!test_bit(WORK_DONE_BIT, &work->flags)) break; @@ -143,8 +151,14 @@ static int worker_loop(void *arg) do { spin_lock_irq(&worker->lock); again_locked: - while (!list_empty(&worker->pending)) { - cur = worker->pending.next; + while (1) { + if (!list_empty(&worker->prio_pending)) + cur = worker->prio_pending.next; + else if (!list_empty(&worker->pending)) + cur = worker->pending.next; + else + break; + work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); clear_bit(WORK_QUEUED_BIT, &work->flags); @@ -163,7 +177,6 @@ again_locked: spin_lock_irq(&worker->lock); check_idle_worker(worker); - } if (freezing(current)) { worker->working = 0; @@ -178,7 +191,8 @@ again_locked: * jump_in? 
*/ smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; /* @@ -191,7 +205,8 @@ again_locked: */ schedule_timeout(1); smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; if (kthread_should_stop()) @@ -200,7 +215,8 @@ again_locked: /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) goto again_locked; /* @@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); spin_lock_init(&workers->lock); workers->max_workers = max; workers->idle_thresh = 32; @@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) } INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); @@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work) goto out; spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); /* by definition we're busy, take ourselves off the idle @@ -422,6 +443,11 @@ out: return 0; } +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + /* * places a struct btrfs_work into the pending queue of one of the kthreads */ @@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) worker = find_worker(workers); if (workers->ordered) { spin_lock_irqsave(&workers->lock, flags); - list_add_tail(&work->order_list, &workers->order_list); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } spin_unlock_irqrestore(&workers->lock, flags); } else { INIT_LIST_HEAD(&work->order_list); @@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); check_busy_worker(worker); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 31be4ed8b63e..1b511c109db6 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -85,6 +85,7 @@ struct btrfs_workers { * of work items waiting for completion */ struct list_head order_list; + struct list_head prio_order_list; /* lock for finding the next worker thread to queue on */ spinlock_t lock; @@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work 
*work); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e5b2533b691a..fedf8b9f03a2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, int ret = 0; int blocksize; - parent = path->nodes[level - 1]; + parent = path->nodes[level + 1]; if (!parent) return 0; nritems = btrfs_header_nritems(parent); - slot = path->slots[level]; + slot = path->slots[level + 1]; blocksize = btrfs_level_size(root, level); if (slot > 0) { @@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = 0; free_extent_buffer(eb); } - if (slot < nritems) { + if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); @@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, } if (block1 || block2) { ret = -EAGAIN; + + /* release the whole path */ btrfs_release_path(root, path); + + /* read the blocks */ if (block1) readahead_tree_block(root, block1, blocksize, 0); if (block2) @@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, eb = read_tree_block(root, block1, blocksize, 0); free_extent_buffer(eb); } - if (block1) { + if (block2) { eb = read_tree_block(root, block2, blocksize, 0); free_extent_buffer(eb); } @@ -1465,6 +1469,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, u32 blocksize; struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; + int ret; blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); @@ -1472,6 +1477,10 @@ read_block_for_search(struct btrfs_trans_handle *trans, tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + /* + * we found an up to date block without sleeping, return + * right away + */ *eb_ret = tmp; return 0; } @@ -1479,18 +1488,34 @@ read_block_for_search(struct btrfs_trans_handle *trans, /* * reduce lock contention at high levels * of the btree by dropping locks before - * we read. + * we read. Don't release the lock on the current + * level because we need to walk this node to figure + * out which blocks to read. */ - btrfs_release_path(NULL, p); + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + if (tmp) free_extent_buffer(tmp); if (p->reada) reada_for_search(root, p, level, slot, key->objectid); + btrfs_release_path(NULL, p); + + ret = -EAGAIN; tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp) + if (tmp) { + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. 
+ */ + if (!btrfs_buffer_uptodate(tmp, 0)) + ret = -EIO; free_extent_buffer(tmp); - return -EAGAIN; + } + return ret; } /* @@ -1689,6 +1714,9 @@ cow_done: if (ret == -EAGAIN) goto again; + if (ret == -EIO) + goto done; + if (!p->skip_locking) { int lret; @@ -1731,6 +1759,8 @@ done: */ if (!p->leave_spinning) btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(root, p); return ret; } @@ -4205,6 +4235,11 @@ again: if (ret == -EAGAIN) goto again; + if (ret < 0) { + btrfs_release_path(root, path); + goto done; + } + if (!path->skip_locking) { ret = btrfs_try_spin_lock(next); if (!ret) { @@ -4239,6 +4274,11 @@ again: if (ret == -EAGAIN) goto again; + if (ret < 0) { + btrfs_release_path(root, path); + goto done; + } + if (!path->skip_locking) { btrfs_assert_tree_locked(path->nodes[level]); ret = btrfs_try_spin_lock(next); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ad96495dedc5..4414a5d9983a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -881,6 +881,9 @@ struct btrfs_fs_info { u64 metadata_alloc_profile; u64 system_alloc_profile; + unsigned data_chunk_allocations; + unsigned metadata_ratio; + void *bdev_holder; }; @@ -2174,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_block); + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_block); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 92caa8035f36..4b0ea0b80c23 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -232,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk(KERN_INFO "btrfs: %s checksum verify failed " - "on %llu wanted %X found %X level %d\n", - root->fs_info->sb->s_id, - buf->start, val, found, btrfs_header_level(buf)); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + } if (result != (char *)&inline_result) kfree(result); return 1; @@ -268,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk("parent transid verify failed on %llu wanted %llu found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + if (printk_ratelimit()) { + printk("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + } ret = 1; clear_extent_buffer_uptodate(io_tree, eb); out: @@ -415,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, found_start = btrfs_header_bytenr(eb); if (found_start != start) { - printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ 
-429,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } if (check_tree_block_fsid(root, eb)) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -579,19 +591,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; atomic_inc(&fs_info->nr_async_submits); + + if (rw & (1 << BIO_RW_SYNCIO)) + btrfs_set_work_high_prio(&async->work); + btrfs_queue_worker(&fs_info->workers, &async->work); -#if 0 - int limit = btrfs_async_submit_limit(fs_info); - if (atomic_read(&fs_info->nr_async_submits) > limit) { - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) < limit), - HZ/10); - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_bios) < limit), - HZ/10); - } -#endif while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, @@ -656,6 +661,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } + /* * kthread helpers are used to submit writes so that checksumming * can happen in parallel across all CPUs @@ -765,27 +771,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -#if 0 -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct buffer_head *bh; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct buffer_head *head; - if (!page_has_buffers(page)) { - create_empty_buffers(page, root->fs_info->sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - head = page_buffers(page); - bh = head; - do { - if (buffer_dirty(bh)) - csum_tree_block(root, bh, 0); - bh = bh->b_this_page; - } while (bh != head); - return block_write_full_page(page, btree_get_block, wbc); -} -#endif - static struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, @@ -863,8 +848,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, if (ret == 0) set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); - else - WARN_ON(1); return buf; } @@ -1273,11 +1256,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) int ret = 0; struct btrfs_device *device; struct backing_dev_info *bdi; -#if 0 - if ((bdi_bits & (1 << BDI_write_congested)) && - btrfs_congested_async(info, 0)) - return 1; -#endif + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -1599,6 +1578,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->metadata_ratio = 8; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1689,7 +1669,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -1699,7 +1679,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - 
features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -2095,10 +2075,10 @@ static int write_dev_supers(struct btrfs_device *device, device->barriers = 0; get_bh(bh); lock_buffer(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } } else { - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } if (!ret && wait) { @@ -2291,7 +2271,7 @@ int close_ctree(struct btrfs_root *root) if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - fs_info->delalloc_bytes); + (unsigned long long)fs_info->delalloc_bytes); } if (fs_info->total_ref_cache_size) { printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", @@ -2328,16 +2308,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); -#if 0 - while (!list_empty(&fs_info->hashers)) { - struct btrfs_hasher *hasher; - hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, - hashers); - list_del(&hasher->hashers); - crypto_free_hash(&fs_info->hash_tfm); - kfree(hasher); - } -#endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 178df4c67de4..3e2c7c738f23 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -312,7 +312,7 @@ btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) } /* - * return the block group that contains teh given bytenr + * return the block group that contains the given bytenr */ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, @@ -1844,10 +1844,14 @@ again: printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " "%llu bytes_pinned, %llu bytes_readonly, %llu may use" - "%llu total\n", bytes, data_sinfo->bytes_delalloc, - data_sinfo->bytes_used, data_sinfo->bytes_reserved, - data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, - data_sinfo->bytes_may_use, data_sinfo->total_bytes); + "%llu total\n", (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_delalloc, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -1918,15 +1922,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, spin_unlock(&info->lock); } +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; + } + rcu_read_unlock(); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; u64 thresh; int ret = 0; - mutex_lock(&extent_root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); flags = btrfs_reduce_alloc_profile(extent_root, flags); @@ -1958,6 +1976,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } spin_unlock(&space_info->lock); + /* + * if we're doing a data chunk, go ahead and make sure 
that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret) space_info->full = 1; @@ -2798,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", info->total_bytes, - info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, - info->bytes_used); + " may_use=%llu, used=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb2bee8b7fbf..fe9eb990e443 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,12 +17,6 @@ #include "ctree.h" #include "btrfs_inode.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); - static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -50,20 +44,23 @@ struct extent_page_data { /* tells writepage not to lock the state bits for this range * it still does the unlocking */ - int extent_locked; + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; }; int __init extent_io_init(void) { - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; return 0; @@ -1404,69 +1401,6 @@ out: return total_bytes; } -#if 0 -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. 
- */ -static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} - -/* - * helper function to unlock both pages and extents in the tree. - */ -static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -#endif - /* * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. @@ -2101,6 +2035,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + /* * the writepage semantics are similar to regular writepage. extent * records are inserted to lock ranges in the tree, and as dirty areas @@ -2136,8 +2080,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 delalloc_end; int page_started; int compressed; + int write_flags; unsigned long nr_written = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC_PLUG; + else + write_flags = WRITE; + WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || @@ -2164,6 +2114,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + while (delalloc_end < page_end) { nr_delalloc = find_lock_delalloc_range(inode, tree, page, @@ -2185,7 +2141,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (page_started) { ret = 0; - goto update_nr_written; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. 
+ */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; } } lock_extent(tree, start, page_end, GFP_NOFS); @@ -2198,13 +2160,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret == -EAGAIN) { unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; - goto update_nr_written; + goto done_unlocked; } } - nr_written++; + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); end = page_end; if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) @@ -2314,9 +2281,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, (unsigned long long)end); } - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, pg_offset, bdev, - &epd->bio, max_nr, + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, end_bio_extent_writepage, 0, 0, 0); if (ret) @@ -2336,11 +2303,8 @@ done: unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); -update_nr_written: - wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; +done_unlocked: + return 0; } @@ -2460,15 +2424,23 @@ retry: return ret; } -static noinline void flush_write_bio(void *data) +static void flush_epd_write_bio(struct extent_page_data *epd) { - struct extent_page_data *epd = data; if (epd->bio) { - submit_one_bio(WRITE, epd->bio, 0, 0); + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); epd->bio = NULL; } } +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + int extent_write_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct writeback_control *wbc) @@ -2480,23 +2452,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, + .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, }; - ret = __extent_writepage(page, wbc, &epd); extent_write_cache_pages(tree, mapping, &wbc_writepages, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2515,6 +2486,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .tree = tree, .get_extent = get_extent, .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = inode->i_mapping->backing_dev_info, @@ -2540,8 +2512,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, start += PAGE_CACHE_SIZE; } - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2556,13 +2527,13 @@ int extent_writepages(struct extent_io_tree *tree, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = extent_write_cache_pages(tree, mapping, wbc, 
__extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b187917b36fa..30c9365861e6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -6,19 +6,14 @@ #include <linux/hardirq.h> #include "extent_map.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = btrfs_cache_create("extent_map", - sizeof(struct extent_map), 0, - NULL); + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) tree->map.rb_node = NULL; spin_lock_init(&tree->lock); } -EXPORT_SYMBOL(extent_map_tree_init); /** * alloc_extent_map - allocate new extent map structure @@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask) atomic_set(&em->refs, 1); return em; } -EXPORT_SYMBOL(alloc_extent_map); /** * free_extent_map - drop reference count of an extent_map @@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em) kmem_cache_free(extent_map_cache, em); } } -EXPORT_SYMBOL(free_extent_map); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) @@ -264,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree, out: return ret; } -EXPORT_SYMBOL(add_extent_mapping); /* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) @@ -326,7 +317,6 @@ found: out: return em; } -EXPORT_SYMBOL(lookup_extent_mapping); /** * remove_extent_mapping - removes an extent_map from the extent tree @@ -346,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) em->in_tree = 0; return ret; } -EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c9fb46ccd08..1d51dc38bb49 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_check_file(struct btrfs_root *root, struct inode *inode) -{ - return 0; -#if 0 - struct btrfs_path *path; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - u64 last_offset = 0; - int nritems; - int slot; - int found_type; - int ret; - int err = 0; - u64 extent_end = 0; - - path = btrfs_alloc_path(); - ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, - last_offset, 0); - while (1) { - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - nritems = btrfs_header_nritems(path->nodes[0]); - } - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != inode->i_ino) - break; - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto out; - - if (found_key.offset < last_offset) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu " - "expected %llu\n", inode->i_ino, - (unsigned long long)found_key.offset, - (unsigned long long)last_offset); - err = 1; - goto 
out; - } - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); - extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, extent); - extent_end = (extent_end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - } - last_offset = extent_end; - path->slots[0]++; - } - if (0 && last_offset < inode->i_size) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu size %llu\n", - inode->i_ino, (unsigned long long)last_offset, - (unsigned long long)inode->i_size); - err = 1; - - } -out: - btrfs_free_path(path); - return err; -#endif -} - /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number @@ -363,15 +286,16 @@ out: */ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_byte) + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_byte) { u64 extent_end = 0; - u64 locked_end = end; u64 search_start = start; u64 leaf_start; u64 ram_bytes = 0; u64 orig_parent = 0; u64 disk_bytenr = 0; + u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; @@ -684,11 +608,10 @@ next_slot: } out: btrfs_free_path(path); - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + if (locked_end > orig_locked_end) { + unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, + locked_end - 1, GFP_NOFS); } - btrfs_check_file(root, inode); return ret; } @@ -830,7 +753,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); BUG_ON(ret); - goto done; + goto release; } else if (split == start) { if (locked_end < extent_end) { ret = try_lock_extent(&BTRFS_I(inode)->io_tree, @@ -926,6 +849,8 @@ again: } done: btrfs_mark_buffer_dirty(leaf); + +release: btrfs_release_path(root, path); if (split_end && split == start) { split = end; @@ -1131,7 +1056,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, pos + write_bytes - 1, - WB_SYNC_NONE); + WB_SYNC_ALL); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 768b9523662d..0bc93657b460 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -332,13 +332,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, printk(KERN_ERR "couldn't find space %llu to free\n", (unsigned long long)offset); printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, block_group->key.objectid, - block_group->key.offset); + block_group->cached, + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->key.offset); btrfs_dump_free_space(block_group, bytes); } else if (info) { printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " "but wanted offset=%llu bytes=%llu\n", - info->offset, info->bytes, offset, bytes); + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); } WARN_ON(1); } @@ 
-357,8 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, - info->bytes); + printk(KERN_ERR "entry offset %llu, bytes %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc7334d833c9..9abbced1123d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); BUG_ON(!path); - search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); search_key.objectid = search_start; search_key.type = 0; search_key.offset = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0d1dd492a58..1c8b0190d031 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; struct kmem_cache *btrfs_path_cachep; #define S_SHIFT 12 @@ -234,7 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, &hint_byte); BUG_ON(ret); if (isize > actual_end) @@ -1439,6 +1438,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, + u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1455,7 +1455,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, file_pos, &hint); + file_pos + num_bytes, locked_end, + file_pos, &hint); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1590,6 +1591,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, + ordered_extent->file_offset + + ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); BUG_ON(ret); @@ -1819,10 +1822,12 @@ good: return 0; zeroit: - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); @@ -2011,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) } /* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. 
+ * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* * read an inode from the btree into the in-memory inode */ void btrfs_read_locked_inode(struct inode *inode) @@ -2021,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + int maybe_acls; u64 alloc_group_block; u32 rdev; int ret; @@ -2067,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode) alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) { + BTRFS_I(inode)->i_acl = NULL; + BTRFS_I(inode)->i_default_acl = NULL; + } + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2877,6 +2944,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, cur_offset + hole_size, + block_end, cur_offset, &hint_byte); if (err) break; @@ -3041,8 +3109,8 @@ static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); - bi->i_acl = NULL; - bi->i_default_acl = NULL; + bi->i_acl = BTRFS_ACL_NOT_CACHED; + bi->i_default_acl = BTRFS_ACL_NOT_CACHED; bi->generation = 0; bi->sequence = 0; @@ -3054,6 +3122,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->flags = 0; bi->index_cnt = (u64)-1; bi->last_unlink_trans = 0; + bi->ordered_data_close = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); @@ -4227,7 +4296,6 @@ out: } if (err) { free_extent_map(em); - WARN_ON(1); return ERR_PTR(err); } return em; @@ -4634,47 +4702,36 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); if (btrfs_transaction_cachep) kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_bit_radix_cachep) - kmem_cache_destroy(btrfs_bit_radix_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); } -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *)) -{ - return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | extra_flags), ctor); -} - int btrfs_init_cachep(void) { - 
btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), - 0, init_once); + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = - btrfs_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, NULL); + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, NULL); + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), - 0, NULL); + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, - SLAB_DESTROY_BY_RCU, NULL); - if (!btrfs_bit_radix_cachep) - goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -4970,10 +5027,10 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode) +static int prealloc_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end, + u64 locked_end, u64 alloc_hint, int mode) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; @@ -4981,10 +5038,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, u64 num_bytes = end - start; int ret = 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, alloc_size, @@ -4997,7 +5050,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, 0, 0, 0, + ins.offset, locked_end, + 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); num_bytes -= ins.offset; @@ -5015,7 +5069,6 @@ out: BUG_ON(ret); } - btrfs_end_transaction(trans, root); return ret; } @@ -5027,13 +5080,21 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; + u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + struct btrfs_trans_handle *trans; int ret; alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. 
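+ * (once the transaction has started and the extent range is locked,
+ * waiting on ordered extents is no longer safe, as noted below)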
+ */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + mutex_lock(&inode->i_mutex); if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); @@ -5041,10 +5102,21 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, - alloc_end - 1, GFP_NOFS); + + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); + if (!trans) { + ret = -EIO; + goto out; + } + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -5052,7 +5124,13 @@ static long btrfs_fallocate(struct inode *inode, int mode, ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, alloc_end - 1, GFP_NOFS); + alloc_start, locked_end, GFP_NOFS); + btrfs_end_transaction(trans, BTRFS_I(inode)->root); + + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); } else { @@ -5070,8 +5148,9 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(inode, cur_offset, - last_byte, alloc_hint, mode); + ret = prealloc_file_range(trans, inode, cur_offset, + last_byte, locked_end + 1, + alloc_hint, mode); if (ret < 0) { free_extent_map(em); break; @@ -5087,8 +5166,10 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); + + btrfs_end_transaction(trans, BTRFS_I(inode)->root); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7594bec1be10..2624b53ea783 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -437,10 +437,6 @@ out_unlock: return 0; } -/* - * Called inside transaction, so use GFP_NOFS - */ - static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) { u64 new_size; @@ -461,15 +457,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -483,11 +473,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", devid); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", devid); + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_unlock; } @@ -545,7 +537,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void 
__user *arg) out_unlock: mutex_unlock(&root->fs_info->volume_mutex); -out: kfree(vol_args); return ret; } @@ -565,15 +556,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -675,19 +660,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -703,19 +682,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -830,7 +803,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + btrfs_drop_extents(trans, root, inode, off, off + len, + off + len, 0, &hint_byte); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 53c87b197d70..d6f0806c682f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -489,7 +489,7 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. 
Starting again diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9744af9d71e9..2ff7cd2db25f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -68,7 +68,7 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, - Opt_flushoncommit, Opt_err, + Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -87,6 +87,7 @@ static match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, {Opt_err, NULL}, }; @@ -195,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_extent = max_t(u64, info->max_extent, root->sectorsize); printk(KERN_INFO "btrfs: max_extent at %llu\n", - info->max_extent); + (unsigned long long)info->max_extent); } break; case Opt_max_inline: @@ -210,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - info->max_inline); + (unsigned long long)info->max_inline); } break; case Opt_alloc_start: @@ -220,7 +221,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - info->alloc_start); + (unsigned long long)info->alloc_start); } break; case Opt_noacl: @@ -234,6 +235,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; default: break; } @@ -410,11 +420,14 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_extent != (u64)-1) - seq_printf(seq, ",max_extent=%llu", info->max_extent); + seq_printf(seq, ",max_extent=%llu", + (unsigned long long)info->max_extent); if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", info->max_inline); + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); @@ -423,9 +436,9 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, SSD)) seq_puts(seq, ",ssd"); if (btrfs_test_opt(root, NOTREELOG)) - seq_puts(seq, ",no-treelog"); + seq_puts(seq, ",notreelog"); if (btrfs_test_opt(root, FLUSHONCOMMIT)) - seq_puts(seq, ",flush-on-commit"); + seq_puts(seq, ",flushoncommit"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); return 0; @@ -489,8 +502,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = -EBUSY; goto error_close_devices; } @@ -504,8 +516,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 
1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); goto error_free_subvol_name; } @@ -522,15 +533,13 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, mutex_unlock(&s->s_root->d_inode->i_mutex); if (IS_ERR(root)) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = PTR_ERR(root); goto error_free_subvol_name; } if (!root->d_inode) { dput(root); - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = -ENXIO; goto error_free_subvol_name; } @@ -635,14 +644,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol = kmalloc(sizeof(*vol), GFP_KERNEL); - if (!vol) - return -ENOMEM; - - if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { - ret = -EFAULT; - goto out; - } + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -650,7 +654,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, &btrfs_fs_type, &fs_devices); break; } -out: + kfree(vol); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2869b3361eb6..01b143605ec1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -687,7 +687,13 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) prepare_to_wait(&info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); + + atomic_dec(&info->throttles); + wake_up(&info->transaction_throttle); + schedule(); + + atomic_inc(&info->throttles); mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 25f20ea11f27..db5e212e8445 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -536,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e0913e469728..5f01dad4b696 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + /* * we try to collect pending bios for a device so we don't get a large * number of procs sending bios down to the same device. 
This greatly @@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) struct bio *pending; struct backing_dev_info *bdi; struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; struct bio *tail; struct bio *cur; int again = 0; - unsigned long num_run = 0; + unsigned long num_run; + unsigned long num_sync_run; unsigned long limit; unsigned long last_waited = 0; @@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* we want to make sure that every time we switch from the sync + * list to the normal list, we unplug + */ + num_sync_run = 0; + loop: spin_lock(&device->io_lock); + num_run = 0; loop_lock: + /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - pending = device->pending_bios; - tail = device->pending_bio_tail; + if (device->pending_sync_bios.head) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; + + pending = pending_bios->head; + tail = pending_bios->tail; WARN_ON(pending && !tail); - device->pending_bios = NULL; - device->pending_bio_tail = NULL; /* * if pending was null this time around, no bios need processing @@ -176,16 +202,41 @@ loop_lock: * device->running_pending is used to synchronize with the * schedule_bio code. */ - if (pending) { - again = 1; - device->running_pending = 1; - } else { + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { again = 0; device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + spin_unlock(&device->io_lock); + /* + * if we're doing the regular priority list, make sure we unplug + * for any high prio bios we've sent down + */ + if (pending_bios == &device->pending_bios && num_sync_run > 0) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + while (pending) { + + rmb(); + if (pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head && + num_run > 16) { + cond_resched(); + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + cur = pending; pending = pending->bi_next; cur->bi_next = NULL; @@ -196,10 +247,18 @@ loop_lock: wake_up(&fs_info->async_submit_wait); BUG_ON(atomic_read(&cur->bi_cnt) == 0); - bio_get(cur); submit_bio(cur->bi_rw, cur); - bio_put(cur); num_run++; + if (bio_sync(cur)) + num_sync_run++; + + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } /* * we made progress, there is more work to do and the bdi @@ -208,7 +267,6 @@ loop_lock: */ if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { - struct bio *old_head; struct io_context *ioc; ioc = current->io_context; @@ -233,17 +291,17 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } continue; } spin_lock(&device->io_lock); - - old_head = device->pending_bios; - device->pending_bios = pending; - if (device->pending_bio_tail) - tail->bi_next = old_head; - else - device->pending_bio_tail = tail; - + requeue_list(pending_bios, pending, tail); 
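+ /* requeue_list() spliced the unsubmitted bios back onto the
+ * front of their list, ahead of anything queued while we were
+ * running, so they are picked up first on the next pass */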
device->running_pending = 1; spin_unlock(&device->io_lock); @@ -251,11 +309,18 @@ loop_lock: goto done; } } + + if (num_sync_run) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + + cond_resched(); if (again) goto loop; spin_lock(&device->io_lock); - if (device->pending_bios) + if (device->pending_bios.head || device->pending_sync_bios.head) goto loop_lock; spin_unlock(&device->io_lock); @@ -1478,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_mark_buffer_dirty(leaf); @@ -1875,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->total_bytes = new_size; if (device->writeable) device->fs_devices->total_rw_bytes -= diff; - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } - WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); btrfs_end_transaction(trans, root); @@ -1914,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) - goto done; + break; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -1927,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; } + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. 
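+ * btrfs_update_device() stores disk_total_bytes in the device item,
+ * so the on-disk size only shrinks after every extent beyond the new
+ * size has been relocated above.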
*/ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); done: btrfs_free_path(path); return ret; @@ -2497,7 +2574,7 @@ again: max_errors = 1; } } - if (multi_ret && rw == WRITE && + if (multi_ret && (rw & (1 << BIO_RW)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2762,6 +2839,7 @@ static noinline int schedule_bio(struct btrfs_root *root, int rw, struct bio *bio) { int should_queue = 1; + struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ if (!(rw & (1 << BIO_RW))) { @@ -2783,13 +2861,17 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); + if (bio_sync(bio)) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; - if (device->pending_bio_tail) - device->pending_bio_tail->bi_next = bio; + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; - device->pending_bio_tail = bio; - if (!device->pending_bios) - device->pending_bios = bio; + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; if (device->running_pending) should_queue = 0; @@ -3006,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf, unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); - device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2185de72ff7d..5c3ff6d02fd7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -23,13 +23,22 @@ #include "async-thread.h" struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; struct btrfs_fs_devices *fs_devices; struct btrfs_root *dev_root; - struct bio *pending_bios; - struct bio *pending_bio_tail; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + int running_pending; u64 generation; @@ -52,6 +61,9 @@ struct btrfs_device { /* size of the device */ u64 total_bytes; + /* size of the disk */ + u64 disk_total_bytes; + /* bytes used */ u64 bytes_used; diff --git a/fs/buffer.c b/fs/buffer.c index 5d55a896ff78..aed297739eb0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -360,7 +360,7 @@ still_busy: * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. 
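+ * The handler is no longer static, so callers of
+ * block_write_full_page_endio() and mark_buffer_async_write_endio()
+ * can name it explicitly as the default completion handler.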
*/ -static void end_buffer_async_write(struct buffer_head *bh, int uptodate) +void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; unsigned long flags; @@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh) set_buffer_async_read(bh); } -void mark_buffer_async_write(struct buffer_head *bh) +void mark_buffer_async_write_endio(struct buffer_head *bh, + bh_end_io_t *handler) { - bh->b_end_io = end_buffer_async_write; + bh->b_end_io = handler; set_buffer_async_write(bh); } + +void mark_buffer_async_write(struct buffer_head *bh) +{ + mark_buffer_async_write_endio(bh, end_buffer_async_write); +} EXPORT_SYMBOL(mark_buffer_async_write); @@ -547,7 +553,7 @@ repeat: return err; } -void do_thaw_all(unsigned long unused) +void do_thaw_all(struct work_struct *work) { struct super_block *sb; char b[BDEVNAME_SIZE]; @@ -567,6 +573,7 @@ restart: goto restart; } spin_unlock(&sb_lock); + kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } @@ -577,7 +584,13 @@ restart: */ void emergency_thaw_all(void) { - pdflush_operation(do_thaw_all, 0); + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_thaw_all); + schedule_work(work); + } } /** @@ -737,7 +750,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; struct list_head tmp; - struct address_space *mapping; + struct address_space *mapping, *prev_mapping = NULL; int err = 0, err2; INIT_LIST_HEAD(&tmp); @@ -762,7 +775,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * contents - it is a noop if I/O is still in * flight on potentially older contents. */ - ll_rw_block(SWRITE_SYNC, 1, &bh); + ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); + + /* + * Kick off IO for the previous mapping. Note + * that we will not run the very last mapping, + * wait_on_buffer() will do that for us + * through sync_buffer(). + */ + if (prev_mapping && prev_mapping != mapping) + blk_run_address_space(prev_mapping); + prev_mapping = mapping; + brelse(bh); spin_lock(lock); } @@ -1585,9 +1609,20 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. + * + * If block_write_full_page() is called with wbc->sync_mode == + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this + * causes the writes to be flagged as synchronous writes, but the + * block device queue will NOT be unplugged, since usually many pages + * will be pushed to the out before the higher-level caller actually + * waits for the writes to be completed. The various wait functions, + * such as wait_on_writeback_range() will ultimately call sync_page() + * which will ultimately call blk_run_backing_dev(), which will end up + * unplugging the device queue. */ static int __block_write_full_page(struct inode *inode, struct page *page, - get_block_t *get_block, struct writeback_control *wbc) + get_block_t *get_block, struct writeback_control *wbc, + bh_end_io_t *handler) { int err; sector_t block; @@ -1595,7 +1630,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 
+ WRITE_SYNC_PLUG : WRITE); BUG_ON(!PageLocked(page)); @@ -1671,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, continue; } if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write(bh); + mark_buffer_async_write_endio(bh, handler); } else { unlock_buffer(bh); } @@ -1724,7 +1760,7 @@ recover: if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); - mark_buffer_async_write(bh); + mark_buffer_async_write_endio(bh, handler); } else { /* * The buffer may have been set dirty during @@ -2361,7 +2397,8 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if ((page->mapping != inode->i_mapping) || (page_offset(page) > size)) { /* page got truncated out from underneath us */ - goto out_unlock; + unlock_page(page); + goto out; } /* page is wholly or partially inside EOF */ @@ -2375,14 +2412,15 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, ret = block_commit_write(page, 0, end); if (unlikely(ret)) { + unlock_page(page); if (ret == -ENOMEM) ret = VM_FAULT_OOM; else /* -ENOSPC, -EIO, etc */ ret = VM_FAULT_SIGBUS; - } + } else + ret = VM_FAULT_LOCKED; -out_unlock: - unlock_page(page); +out: return ret; } @@ -2650,7 +2688,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block, out: ret = mpage_writepage(page, get_block, wbc); if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc); + ret = __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); return ret; } EXPORT_SYMBOL(nobh_writepage); @@ -2808,9 +2847,10 @@ out: /* * The generic ->writepage function for buffer-backed address_spaces + * this form passes in the end_io handler used to finish the IO. */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +int block_write_full_page_endio(struct page *page, get_block_t *get_block, + struct writeback_control *wbc, bh_end_io_t *handler) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -2819,7 +2859,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully inside i_size? */ if (page->index < end_index) - return __block_write_full_page(inode, page, get_block, wbc); + return __block_write_full_page(inode, page, get_block, wbc, + handler); /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); @@ -2842,9 +2883,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block, * writes to that region are not written out to the file." 
*/ zero_user_segment(page, offset, PAGE_CACHE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc); + return __block_write_full_page(inode, page, get_block, wbc, handler); } +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + return block_write_full_page_endio(page, get_block, wbc, + end_buffer_async_write); +} + + sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { @@ -2957,12 +3009,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE || rw == SWRITE_SYNC) + if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) lock_buffer(bh); else if (!trylock_buffer(bh)) continue; - if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC || + rw == SWRITE_SYNC_PLUG) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); @@ -2998,7 +3051,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -3312,9 +3365,11 @@ EXPORT_SYMBOL(block_read_full_page); EXPORT_SYMBOL(block_sync_page); EXPORT_SYMBOL(block_truncate_page); EXPORT_SYMBOL(block_write_full_page); +EXPORT_SYMBOL(block_write_full_page_endio); EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); +EXPORT_SYMBOL(end_buffer_async_write); EXPORT_SYMBOL(file_fsync); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 65984006192c..f20c4069c220 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,16 @@ +Version 1.58 +------------ +Guard against buffer overruns in various UCS-2 to UTF-8 string conversions +when the UTF-8 string is composed of unusually long (more than 4 byte) converted +characters. Add support for mounting root of a share which redirects immediately +to DFS target. Convert string conversion functions from Unicode to more +accurately mark string length before allocating memory (which may help the +rare cases where a UTF-8 string is much larger than the UCS2 string that +we converted from). Fix endianness of the vcnum field used during +session setup to distinguish multiple mounts to same server from different +userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental +flag to be set to 2, and mount must enable krb5 to turn on extended security). + Version 1.57 ------------ Improve support for multiple security contexts to the same server. We @@ -15,7 +28,8 @@ Posix file open support added (turned off after one attempt if server fails to support it properly, as with Samba server versions prior to 3.3.2) Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too little memory for the "nativeFileSystem" field returned by the server -during mount). +during mount). Endian convert inode numbers if necessary (makes it easier +to compare inode numbers on network files from big endian systems). 
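The 1.58 entry above describes the string-conversion hardening in fs/cifs/cifs_unicode.c: measure how large the converted string will be, allocate exactly that much, then bound every per-character write. Below is a minimal, self-contained userspace sketch of that two-pass pattern; the helper names (ucs2_bytes, strndup_from_ucs) are illustrative, and the toy uni2char() merely stands in for the kernel's nls table hook, it is not cifs code.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define MAX_CHARSET_SIZE 6	/* worst-case bytes per converted character */

/* toy stand-in for codepage->uni2char(): ASCII passes, the rest maps to '?' */
static int uni2char(uint16_t uc, char *out, int outsize)
{
	if (outsize < 1)
		return -1;
	*out = (uc < 0x80) ? (char)uc : '?';
	return 1;
}

/* first pass: how many bytes will the converted string need? */
static int ucs2_bytes(const uint16_t *from, int maxbytes)
{
	char tmp[MAX_CHARSET_SIZE];
	int i, charlen, outlen = 0;

	for (i = 0; i < maxbytes / 2 && from[i]; i++) {
		charlen = uni2char(from[i], tmp, sizeof(tmp));
		outlen += charlen > 0 ? charlen : 1;	/* '?' on failure */
	}
	return outlen;
}

/* second pass: allocate the measured size and convert into it */
static char *strndup_from_ucs(const uint16_t *src, int maxbytes)
{
	int i, charlen, outlen = 0;
	int len = ucs2_bytes(src, maxbytes) + 1;	/* + null terminator */
	char *dst = malloc(len);

	if (!dst)
		return NULL;
	for (i = 0; i < maxbytes / 2 && src[i]; i++) {
		charlen = uni2char(src[i], dst + outlen, len - outlen);
		if (charlen > 0)
			outlen += charlen;
		else
			dst[outlen++] = '?';
	}
	dst[outlen] = '\0';
	return dst;
}

int main(void)
{
	/* "hi!" plus one non-ASCII code point; host-endian for simplicity,
	 * where the kernel code would do le16_to_cpu() per character */
	uint16_t wire[] = { 'h', 'i', '!', 0x263a, 0 };
	char *s = strndup_from_ucs(wire, (int)sizeof(wire));

	if (s) {
		printf("%s\n", s);	/* prints: hi!? */
		free(s);
	}
	return 0;
}

Because the second pass converts at most the bytes the first pass counted, the output can never outgrow the allocation even when a single character expands to several bytes, which is exactly the overrun the changelog entry guards against.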
Version 1.56 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 07434181623b..db208ddb9899 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -651,7 +651,15 @@ Experimental When set to 1 used to enable certain experimental signing turned on in case buffer was modified just before it was sent, also this flag will be used to use the new experimental directory change - notification code). + notification code). When set to 2 enables + an additional experimental feature, "raw ntlmssp" + session establishment support (which allows + specifying "sec=ntlmssp" on mount). The Linux cifs + module will use ntlmv2 authentication encapsulated + in "raw ntlmssp" (not using SPNEGO) when + "sec=ntlmssp" is specified on mount. + This support also requires building cifs with + the CONFIG_CIFS_EXPERIMENTAL configuration flag. These experimental features and tracing can be enabled by changing flags in /proc/fs/cifs (after the cifs module has been installed or built into the diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 5fdbf8a14472..83d62759c7c7 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -340,28 +340,24 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); for (i = 0; i < num_referrals; i++) { + int len; dump_referral(referrals+i); - /* connect to a storage node */ - if (referrals[i].flags & DFSREF_STORAGE_SERVER) { - int len; - len = strlen(referrals[i].node_name); - if (len < 2) { - cERROR(1, ("%s: Net Address path too short: %s", + /* connect to a node */ + len = strlen(referrals[i].node_name); + if (len < 2) { + cERROR(1, ("%s: Net Address path too short: %s", __func__, referrals[i].node_name)); - rc = -EINVAL; - goto out_err; - } - mnt = cifs_dfs_do_refmount(nd->path.mnt, - nd->path.dentry, - referrals + i); - cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", - __func__, + rc = -EINVAL; + goto out_err; + } + mnt = cifs_dfs_do_refmount(nd->path.mnt, + nd->path.dentry, referrals + i); + cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, referrals[i].node_name, mnt)); - /* complete mount procedure if we accured submount */ - if (!IS_ERR(mnt)) - break; - } + /* complete mount procedure if we accured submount */ + if (!IS_ERR(mnt)) + break; } /* we need it cause for() above could exit without valid submount */ diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 3fd3a9df043a..67bf93a40d2e 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -41,7 +41,7 @@ cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) /* attach the data */ memcpy(payload, data, datalen); - rcu_assign_pointer(key->payload.data, payload); + key->payload.data = payload; ret = 0; error: diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 7d75272a6b3f..60e3c4253de0 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -1,7 +1,7 @@ /* * fs/cifs/cifs_unicode.c * - * Copyright (c) International Business Machines Corp., 2000,2005 + * Copyright (c) International Business Machines Corp., 2000,2009 * Modified by Steve French (sfrench@us.ibm.com) * * This program is free software; you can redistribute it and/or modify @@ -26,31 +26,157 @@ #include "cifs_debug.h" /* - * NAME: cifs_strfromUCS() - * - * FUNCTION: Convert little-endian unicode string to character string + * cifs_ucs2_bytes - how long will a string be after conversion? 
+ * @ucs - pointer to input string + * @maxbytes - don't go past this many bytes of input string + * @codepage - destination codepage * + * Walk a ucs2le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. */ int -cifs_strfromUCS_le(char *to, const __le16 *from, - int len, const struct nls_table *codepage) +cifs_ucs2_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) { int i; - int outlen = 0; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; - for (i = 0; (i < len) && from[i]; i++) { - int charlen; - /* 2.4.0 kernel or greater */ - charlen = - codepage->uni2char(le16_to_cpu(from[i]), &to[outlen], - NLS_MAX_CHARSET_SIZE); - if (charlen > 0) { + for (i = 0; from[i] && i < maxwords; i++) { + charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, + NLS_MAX_CHARSET_SIZE); + if (charlen > 0) outlen += charlen; - } else { - to[outlen++] = '?'; + else + outlen++; + } + + return outlen; +} + +/* + * cifs_mapchar - convert a little-endian char to proper char in codepage + * @target - where converted character should be copied + * @src_char - 2 byte little-endian source character + * @cp - codepage to which character should be converted + * @mapchar - should character be mapped according to mapchars mount option? + * + * This function handles the conversion of a single character. It is the + * responsibility of the caller to ensure that the target buffer is large + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). + */ +static int +cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, + bool mapchar) +{ + int len = 1; + + if (!mapchar) + goto cp_convert; + + /* + * BB: Cannot handle remapping UNI_SLASH until all the calls to + * build_path_from_dentry are modified, as they use slash as + * separator. + */ + switch (le16_to_cpu(src_char)) { + case UNI_COLON: + *target = ':'; + break; + case UNI_ASTERIK: + *target = '*'; + break; + case UNI_QUESTION: + *target = '?'; + break; + case UNI_PIPE: + *target = '|'; + break; + case UNI_GRTRTHAN: + *target = '>'; + break; + case UNI_LESSTHAN: + *target = '<'; + break; + default: + goto cp_convert; + } + +out: + return len; + +cp_convert: + len = cp->uni2char(le16_to_cpu(src_char), target, + NLS_MAX_CHARSET_SIZE); + if (len <= 0) { + *target = '?'; + len = 1; + } + goto out; +} + +/* + * cifs_from_ucs2 - convert utf16le string to local charset + * @to - destination buffer + * @from - source buffer + * @tolen - destination buffer size (in bytes) + * @fromlen - source buffer size (in bytes) + * @codepage - codepage to which characters should be converted + * @mapchar - should characters be remapped according to the mapchars option? + * + * Convert a little-endian ucs2le string (as sent by the server) to a string + * in the provided codepage. The tolen and fromlen parameters are to ensure + * that the code doesn't walk off of the end of the buffer (which is always + * a danger if the alignment of the source buffer is off). The destination + * string is always properly null terminated and fits in the destination + * buffer. Returns the length of the destination string in bytes (including + * null terminator). + * + * Note that some windows versions actually send multiword UTF-16 characters + * instead of straight UCS-2. 
The linux nls routines however aren't able to + * deal with those characters properly. In the event that we get some of + * those characters, they won't be translated properly. + */ +int +cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, bool mapchar) +{ + int i, charlen, safelen; + int outlen = 0; + int nullsize = nls_nullsize(codepage); + int fromwords = fromlen / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + + /* + * because the chars can be of varying widths, we need to take care + * not to overflow the destination buffer when we get close to the + * end of it. Until we get to this offset, we don't need to check + * for overflow however. + */ + safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); + + for (i = 0; i < fromwords && from[i]; i++) { + /* + * check to see if converting this character might make the + * conversion bleed into the null terminator + */ + if (outlen >= safelen) { + charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); + if ((outlen + charlen) > (tolen - nullsize)) + break; } + + /* put converted char into 'to' buffer */ + charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); + outlen += charlen; } - to[outlen] = 0; + + /* properly null-terminate string */ + for (i = 0; i < nullsize; i++) + to[outlen++] = 0; + return outlen; } @@ -88,3 +214,41 @@ cifs_strtoUCS(__le16 *to, const char *from, int len, return i; } +/* + * cifs_strndup_from_ucs - copy a string from wire format to the local codepage + * @src - source string + * @maxlen - don't walk past this many bytes in the source string + * @is_unicode - is this a unicode string? + * @codepage - destination codepage + * + * Take a string given by the server, convert it to the local codepage and + * put it in a new buffer. Returns a pointer to the new string or NULL on + * error. + */ +char * +cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, + const struct nls_table *codepage) +{ + int len; + char *dst; + + if (is_unicode) { + len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage); + len += nls_nullsize(codepage); + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return NULL; + cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage, + false); + } else { + len = strnlen(src, maxlen); + len++; + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return NULL; + strlcpy(dst, src, len); + } + + return dst; +} + diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 14eb9a2395d3..650638275a6f 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -5,7 +5,7 @@ * Convert a unicode character to upper or lower case using * compressed tables. * - * Copyright (c) International Business Machines Corp., 2000,2007 + * Copyright (c) International Business Machines Corp., 2000,2009 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -37,6 +37,19 @@ #define UNIUPR_NOLOWER /* Example to not expand lower case tables */ +/* + * Windows maps these to the user defined 16 bit Unicode range since they are + * reserved symbols (along with \ and /), otherwise illegal to store + * in filenames in NTFS + */ +#define UNI_ASTERIK (__u16) ('*' + 0xF000) +#define UNI_QUESTION (__u16) ('?' 
+ 0xF000) +#define UNI_COLON (__u16) (':' + 0xF000) +#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) +#define UNI_LESSTHAN (__u16) ('<' + 0xF000) +#define UNI_PIPE (__u16) ('|' + 0xF000) +#define UNI_SLASH (__u16) ('\\' + 0xF000) + /* Just define what we want from uniupr.h. We don't want to define the tables * in each source file. */ @@ -59,8 +72,14 @@ extern struct UniCaseRange UniLowerRange[]; #endif /* UNIUPR_NOLOWER */ #ifdef __KERNEL__ -int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *); +int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, bool mapchar); +int cifs_ucs2_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage); int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); +char *cifs_strndup_from_ucs(const char *src, const int maxlen, + const bool is_unicode, + const struct nls_table *codepage); #endif /* diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 38491fd3871d..5e6d35804d73 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -35,6 +35,7 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/smp_lock.h> #include "cifsfs.h" #include "cifspdu.h" #define DECLARE_GLOBALS_HERE @@ -66,9 +67,6 @@ unsigned int sign_CIFS_PDUs = 1; extern struct task_struct *oplockThread; /* remove sparse warning */ struct task_struct *oplockThread = NULL; /* extern struct task_struct * dnotifyThread; remove sparse warning */ -#ifdef CONFIG_CIFS_EXPERIMENTAL -static struct task_struct *dnotifyThread = NULL; -#endif static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, int, 0); @@ -316,6 +314,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->clientCanCacheAll = false; cifs_inode->delete_pending = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ + cifs_inode->server_eof = 0; /* Can not set i_flags here - they get immediately overwritten to zero by the VFS */ @@ -532,6 +531,7 @@ static void cifs_umount_begin(struct super_block *sb) if (tcon == NULL) return; + lock_kernel(); read_lock(&cifs_tcp_ses_lock); if (tcon->tc_count == 1) tcon->tidStatus = CifsExiting; @@ -550,6 +550,7 @@ static void cifs_umount_begin(struct super_block *sb) } /* BB FIXME - finish add checks for tidStatus BB */ + unlock_kernel(); return; } @@ -601,8 +602,7 @@ cifs_get_sb(struct file_system_type *fs_type, rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 
1 : 0); if (rc) { - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); return rc; } sb->s_flags |= MS_ACTIVE; @@ -1040,34 +1040,6 @@ static int cifs_oplock_thread(void *dummyarg) return 0; } -#ifdef CONFIG_CIFS_EXPERIMENTAL -static int cifs_dnotify_thread(void *dummyarg) -{ - struct list_head *tmp; - struct TCP_Server_Info *server; - - do { - if (try_to_freeze()) - continue; - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(15*HZ); - /* check if any stuck requests that need - to be woken up and wakeq so the - thread can wake up and error out */ - read_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &cifs_tcp_ses_list) { - server = list_entry(tmp, struct TCP_Server_Info, - tcp_ses_list); - if (atomic_read(&server->inFlight)) - wake_up_all(&server->response_q); - } - read_unlock(&cifs_tcp_ses_lock); - } while (!kthread_should_stop()); - - return 0; -} -#endif - static int __init init_cifs(void) { @@ -1144,21 +1116,8 @@ init_cifs(void) goto out_unregister_dfs_key_type; } -#ifdef CONFIG_CIFS_EXPERIMENTAL - dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd"); - if (IS_ERR(dnotifyThread)) { - rc = PTR_ERR(dnotifyThread); - cERROR(1, ("error %d create dnotify thread", rc)); - goto out_stop_oplock_thread; - } -#endif - return 0; -#ifdef CONFIG_CIFS_EXPERIMENTAL - out_stop_oplock_thread: -#endif - kthread_stop(oplockThread); out_unregister_dfs_key_type: #ifdef CONFIG_CIFS_DFS_UPCALL unregister_key_type(&key_type_dns_resolver); @@ -1196,9 +1155,6 @@ exit_cifs(void) cifs_destroy_inodecache(); cifs_destroy_mids(); cifs_destroy_request_bufs(); -#ifdef CONFIG_CIFS_EXPERIMENTAL - kthread_stop(dnotifyThread); -#endif kthread_stop(oplockThread); } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 77e190dc2883..051b71cfdea9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.57" +#define CIFS_VERSION "1.58" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9fbf4dff5da6..a61ab772c6f6 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -82,8 +82,8 @@ enum securityEnum { LANMAN, /* Legacy LANMAN auth */ NTLM, /* Legacy NTLM012 auth with NTLM hash */ NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ - RawNTLMSSP, /* NTLMSSP without SPNEGO */ - NTLMSSP, /* NTLMSSP via SPNEGO */ + RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ + NTLMSSP, /* NTLMSSP via SPNEGO, NTLMv2 hash */ Kerberos, /* Kerberos via SPNEGO */ MSKerberos, /* MS Kerberos via SPNEGO */ }; @@ -350,7 +350,7 @@ struct cifsFileInfo { bool invalidHandle:1; /* file closed via session abend */ bool messageMode:1; /* for pipes: message vs byte mode */ atomic_t wrtPending; /* handle in use - defer close */ - struct semaphore fh_sem; /* prevents reopen race after dead ses*/ + struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; }; @@ -370,6 +370,7 @@ struct cifsInodeInfo { bool clientCanCacheAll:1; /* read and writebehind oplock */ bool oplockPending:1; bool delete_pending:1; /* DELETE_ON_CLOSE is set */ + u64 server_eof; /* current file size on server */ struct inode vfs_inode; }; @@ -530,6 +531,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define CIFSSEC_MAY_PLNTXT 0 #endif /* weak passwords */ #define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ +#define 
CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_MUST_SIGN 0x01001 /* note that only one of the following can be set so the @@ -542,22 +544,23 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_LANMAN 0x10010 #define CIFSSEC_MUST_PLNTXT 0x20020 #ifdef CONFIG_CIFS_UPCALL -#define CIFSSEC_MASK 0x3F03F /* allows weak security but also krb5 */ +#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */ #else -#define CIFSSEC_MASK 0x37037 /* current flags supported if weak */ +#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */ #endif /* UPCALL */ #else /* do not allow weak pw hash */ #ifdef CONFIG_CIFS_UPCALL -#define CIFSSEC_MASK 0x0F00F /* flags supported if no weak allowed */ +#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ #else -#define CIFSSEC_MASK 0x07007 /* flags supported if no weak allowed */ +#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */ #endif /* UPCALL */ #endif /* WEAK_PW_HASH */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ +#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) -#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5) +#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) /* ***************************************************************** * All constants go here diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b370489c8da5..a785f69dbc9f 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2163,7 +2163,7 @@ typedef struct { __le32 Type; __le64 DevMajor; __le64 DevMinor; - __u64 UniqueId; + __le64 UniqueId; __le64 Permissions; __le64 Nlinks; } __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ @@ -2308,7 +2308,7 @@ struct unlink_psx_rq { /* level 0x20a SetPathInfo */ } __attribute__((packed)); struct file_internal_info { - __u64 UniqueId; /* inode number */ + __le64 UniqueId; /* inode number */ } __attribute__((packed)); /* level 0x3ee */ struct file_mode_info { @@ -2338,7 +2338,7 @@ typedef struct { __le32 Type; __le64 DevMajor; __le64 DevMinor; - __u64 UniqueId; + __le64 UniqueId; __le64 Permissions; __le64 Nlinks; char FileName[1]; @@ -2386,7 +2386,7 @@ typedef struct { __le32 FileNameLength; __le32 EaSize; /* EA size */ __le32 Reserved; - __u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ + __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ char FileName[1]; } __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4167716d32f2..fae083930eee 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -260,8 +260,7 @@ extern int CIFSUnixCreateSymLink(const int xid, const struct nls_table *nls_codepage); extern int CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon, - const unsigned char *searchName, - char *syminfo, const int buflen, + const unsigned char *searchName, char **syminfo, const struct nls_table *nls_codepage); extern int CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, @@ -307,8 +306,6 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, __u64 *inode_number, const 
struct nls_table *nls_codepage, int remap_special_chars); -extern int cifs_convertUCSpath(char *target, const __le16 *source, int maxlen, - const struct nls_table *codepage); extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, const struct nls_table *cp, int mapChars); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index bc09c998631f..5759ba53dc96 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1,7 +1,7 @@ /* * fs/cifs/cifssmb.c * - * Copyright (C) International Business Machines Corp., 2002,2008 + * Copyright (C) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * * Contains the routines for constructing the SMB PDUs themselves @@ -81,41 +81,6 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ -/* Allocates buffer into dst and copies smb string from src to it. - * caller is responsible for freeing dst if function returned 0. - * returns: - * on success - 0 - * on failure - errno - */ -static int -cifs_strncpy_to_host(char **dst, const char *src, const int maxlen, - const bool is_unicode, const struct nls_table *nls_codepage) -{ - int plen; - - if (is_unicode) { - plen = UniStrnlen((wchar_t *)src, maxlen); - *dst = kmalloc(plen + 2, GFP_KERNEL); - if (!*dst) - goto cifs_strncpy_to_host_ErrExit; - cifs_strfromUCS_le(*dst, (__le16 *)src, plen, nls_codepage); - } else { - plen = strnlen(src, maxlen); - *dst = kmalloc(plen + 2, GFP_KERNEL); - if (!*dst) - goto cifs_strncpy_to_host_ErrExit; - strncpy(*dst, src, plen); - } - (*dst)[plen] = 0; - (*dst)[plen+1] = 0; /* harmless for ASCII case, needed for Unicode */ - return 0; - -cifs_strncpy_to_host_ErrExit: - cERROR(1, ("Failed to allocate buffer for string\n")); - return -ENOMEM; -} - - /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) @@ -484,6 +449,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) cFYI(1, ("Kerberos only mechanism, enable extended security")); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; } +#ifdef CONFIG_CIFS_EXPERIMENTAL + else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { + cFYI(1, ("NTLMSSP only mechanism, enable extended security")); + pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; + } +#endif count = 0; for (i = 0; i < CIFS_NUM_PROT; i++) { @@ -620,6 +593,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->secType = NTLMv2; else if (secFlags & CIFSSEC_MAY_KRB5) server->secType = Kerberos; + else if (secFlags & CIFSSEC_MAY_NTLMSSP) + server->secType = NTLMSSP; else if (secFlags & CIFSSEC_MAY_LANMAN) server->secType = LANMAN; /* #ifdef CONFIG_CIFS_EXPERIMENTAL @@ -1626,6 +1601,8 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, int smb_hdr_len; int resp_buf_type = 0; + *nbytes = 0; + cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); if (tcon->ses->capabilities & CAP_LARGE_FILES) { @@ -1682,11 +1659,9 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, cifs_stats_inc(&tcon->num_writes); if (rc) { cFYI(1, ("Send error Write2 = %d", rc)); - *nbytes = 0; } else if (resp_buf_type == 0) { /* presumably this can not happen, but best to be safe */ rc = -EIO; - *nbytes = 0; } else { WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; *nbytes = le16_to_cpu(pSMBr->CountHigh); @@ -2417,8 +2392,7 @@ winCreateHardLinkRetry: int 
CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon, - const unsigned char *searchName, - char *symlinkinfo, const int buflen, + const unsigned char *searchName, char **symlinkinfo, const struct nls_table *nls_codepage) { /* SMB_QUERY_FILE_UNIX_LINK */ @@ -2428,6 +2402,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon, int bytes_returned; int name_len; __u16 params, byte_count; + char *data_start; cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); @@ -2482,30 +2457,26 @@ querySymLinkRetry: /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 2)) /* BB also check enough total bytes returned */ - rc = -EIO; /* bad smb */ + if (rc || (pSMBr->ByteCount < 2)) + rc = -EIO; else { - __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + bool is_unicode; + u16 count = le16_to_cpu(pSMBr->t2.DataCount); + + data_start = ((char *) &pSMBr->hdr.Protocol) + + le16_to_cpu(pSMBr->t2.DataOffset); + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = UniStrnlen((wchar_t *) ((char *) - &pSMBr->hdr.Protocol + data_offset), - min_t(const int, buflen, count) / 2); /* BB FIXME investigate remapping reserved chars here */ - cifs_strfromUCS_le(symlinkinfo, - (__le16 *) ((char *)&pSMBr->hdr.Protocol - + data_offset), - name_len, nls_codepage); - } else { - strncpy(symlinkinfo, - (char *) &pSMBr->hdr.Protocol + - data_offset, - min_t(const int, buflen, count)); - } - symlinkinfo[buflen] = 0; - /* just in case so calling code does not go off the end of buffer */ + *symlinkinfo = cifs_strndup_from_ucs(data_start, count, + is_unicode, nls_codepage); + if (!*symlinkinfo) + rc = -ENOMEM; } } cifs_buf_release(pSMB); @@ -2603,7 +2574,6 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, *pparmlen = parm_count; return 0; } -#endif /* CIFS_EXPERIMENTAL */ int CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, @@ -2613,7 +2583,6 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, { int rc = 0; int bytes_returned; - int name_len; struct smb_com_transaction_ioctl_req *pSMB; struct smb_com_transaction_ioctl_rsp *pSMBr; @@ -2650,59 +2619,55 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, } else { /* decode response */ __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); __u32 data_count = le32_to_cpu(pSMBr->DataCount); - if ((pSMBr->ByteCount < 2) || (data_offset > 512)) + if ((pSMBr->ByteCount < 2) || (data_offset > 512)) { /* BB also check enough total bytes returned */ rc = -EIO; /* bad smb */ - else { - if (data_count && (data_count < 2048)) { - char *end_of_smb = 2 /* sizeof byte count */ + - pSMBr->ByteCount + - (char *)&pSMBr->ByteCount; + goto qreparse_out; + } + if (data_count && (data_count < 2048)) { + char *end_of_smb = 2 /* sizeof byte count */ + + pSMBr->ByteCount + (char *)&pSMBr->ByteCount; - struct reparse_data *reparse_buf = + struct reparse_data *reparse_buf = (struct reparse_data *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if ((char *)reparse_buf >= end_of_smb) { - rc = -EIO; - goto qreparse_out; - } - if ((reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset + - reparse_buf->TargetNameLen) > - end_of_smb) { - cFYI(1, ("reparse buf beyond SMB")); - rc = -EIO; - goto qreparse_out; - } + if ((char *)reparse_buf >= end_of_smb) { + rc = -EIO; + goto qreparse_out; + } + if
((reparse_buf->LinkNamesBuf + + reparse_buf->TargetNameOffset + + reparse_buf->TargetNameLen) > end_of_smb) { + cFYI(1, ("reparse buf beyond SMB")); + rc = -EIO; + goto qreparse_out; + } - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = UniStrnlen((wchar_t *) + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { + cifs_from_ucs2(symlinkinfo, (__le16 *) (reparse_buf->LinkNamesBuf + reparse_buf->TargetNameOffset), - min(buflen/2, - reparse_buf->TargetNameLen / 2)); - cifs_strfromUCS_le(symlinkinfo, - (__le16 *) (reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset), - name_len, nls_codepage); - } else { /* ASCII names */ - strncpy(symlinkinfo, - reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset, - min_t(const int, buflen, - reparse_buf->TargetNameLen)); - } - } else { - rc = -EIO; - cFYI(1, ("Invalid return data count on " - "get reparse info ioctl")); + buflen, + reparse_buf->TargetNameLen, + nls_codepage, 0); + } else { /* ASCII names */ + strncpy(symlinkinfo, + reparse_buf->LinkNamesBuf + + reparse_buf->TargetNameOffset, + min_t(const int, buflen, + reparse_buf->TargetNameLen)); } - symlinkinfo[buflen] = 0; /* just in case so the caller - does not go off the end of the buffer */ - cFYI(1, ("readlink result - %s", symlinkinfo)); + } else { + rc = -EIO; + cFYI(1, ("Invalid return data count on " + "get reparse info ioctl")); } + symlinkinfo[buflen] = 0; /* just in case so the caller + does not go off the end of the buffer */ + cFYI(1, ("readlink result - %s", symlinkinfo)); } + qreparse_out: cifs_buf_release(pSMB); @@ -2711,6 +2676,7 @@ qreparse_out: return rc; } +#endif /* CIFS_EXPERIMENTAL */ #ifdef CONFIG_CIFS_POSIX @@ -3918,7 +3884,7 @@ GetInodeNumberRetry: } pfinfo = (struct file_internal_info *) (data_offset + (char *) &pSMBr->hdr.Protocol); - *inode_number = pfinfo->UniqueId; + *inode_number = le64_to_cpu(pfinfo->UniqueId); } } GetInodeNumOut: @@ -3928,27 +3894,6 @@ GetInodeNumOut: return rc; } -/* computes length of UCS string converted to host codepage - * @src: UCS string - * @maxlen: length of the input string in UCS characters - * (not in bytes) - * - * return: size of input string in host codepage - */ -static int hostlen_fromUCS(const __le16 *src, const int maxlen, - const struct nls_table *nls_codepage) { - int i; - int hostlen = 0; - char to[4]; - int charlen; - for (i = 0; (i < maxlen) && src[i]; ++i) { - charlen = nls_codepage->uni2char(le16_to_cpu(src[i]), - to, NLS_MAX_CHARSET_SIZE); - hostlen += charlen > 0 ? charlen : 1; - } - return hostlen; -} - /* parses DFS refferal V3 structure * caller is responsible for freeing target_nodes * returns: @@ -3994,7 +3939,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... 
\n", *num_of_nodes, - le16_to_cpu(pSMBr->DFSFlags))); + le32_to_cpu(pSMBr->DFSFlags))); *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * *num_of_nodes, GFP_KERNEL); @@ -4010,14 +3955,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, int max_len; struct dfs_info3_param *node = (*target_nodes)+i; - node->flags = le16_to_cpu(pSMBr->DFSFlags); + node->flags = le32_to_cpu(pSMBr->DFSFlags); if (is_unicode) { __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, GFP_KERNEL); cifsConvertToUCS((__le16 *) tmp, searchName, PATH_MAX, nls_codepage, remap); - node->path_consumed = hostlen_fromUCS(tmp, - le16_to_cpu(pSMBr->PathConsumed)/2, + node->path_consumed = cifs_ucs2_bytes(tmp, + le16_to_cpu(pSMBr->PathConsumed), nls_codepage); kfree(tmp); } else @@ -4029,20 +3974,20 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, /* copy DfsPath */ temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); max_len = data_end - temp; - rc = cifs_strncpy_to_host(&(node->path_name), temp, - max_len, is_unicode, nls_codepage); - if (rc) + node->path_name = cifs_strndup_from_ucs(temp, max_len, + is_unicode, nls_codepage); + if (!node->path_name) { + rc = -ENOMEM; goto parse_DFS_referrals_exit; + } /* copy link target UNC */ temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); max_len = data_end - temp; - rc = cifs_strncpy_to_host(&(node->node_name), temp, - max_len, is_unicode, nls_codepage); - if (rc) - goto parse_DFS_referrals_exit; - - ref += le16_to_cpu(ref->Size); + node->node_name = cifs_strndup_from_ucs(temp, max_len, + is_unicode, nls_codepage); + if (!node->node_name) + rc = -ENOMEM; } parse_DFS_referrals_exit: diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0de3b5615a22..4aa81a507b74 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1,7 +1,7 @@ /* * fs/cifs/connect.c * - * Copyright (C) International Business Machines Corp., 2002,2008 + * Copyright (C) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -32,6 +32,7 @@ #include <linux/kthread.h> #include <linux/pagevec.h> #include <linux/freezer.h> +#include <linux/namei.h> #include <asm/uaccess.h> #include <asm/processor.h> #include <net/ipv6.h> @@ -978,6 +979,13 @@ cifs_parse_mount_options(char *options, const char *devname, return 1; } else if (strnicmp(value, "krb5", 4) == 0) { vol->secFlg |= CIFSSEC_MAY_KRB5; +#ifdef CONFIG_CIFS_EXPERIMENTAL + } else if (strnicmp(value, "ntlmsspi", 8) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMSSP | + CIFSSEC_MUST_SIGN; + } else if (strnicmp(value, "ntlmssp", 7) == 0) { + vol->secFlg |= CIFSSEC_MAY_NTLMSSP; +#endif } else if (strnicmp(value, "ntlmv2i", 7) == 0) { vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN; @@ -2214,9 +2222,58 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon, return rc; } +static void +cleanup_volume_info(struct smb_vol **pvolume_info) +{ + struct smb_vol *volume_info; + + if (!pvolume_info && !*pvolume_info) + return; + + volume_info = *pvolume_info; + kzfree(volume_info->password); + kfree(volume_info->UNC); + kfree(volume_info->prepath); + kfree(volume_info); + *pvolume_info = NULL; + return; +} + +#ifdef CONFIG_CIFS_DFS_UPCALL +/* build_path_to_root returns full path to root when + * we do not have an exiting connection (tcon) */ +static char * +build_unc_path_to_root(const struct smb_vol *volume_info, + const struct cifs_sb_info *cifs_sb) +{ + char *full_path; + + int unc_len = strnlen(volume_info->UNC, 
MAX_TREE_SIZE + 1); + full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL); + if (full_path == NULL) + return ERR_PTR(-ENOMEM); + + strncpy(full_path, volume_info->UNC, unc_len); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { + int i; + for (i = 0; i < unc_len; i++) { + if (full_path[i] == '\\') + full_path[i] = '/'; + } + } + + if (cifs_sb->prepathlen) + strncpy(full_path + unc_len, cifs_sb->prepath, + cifs_sb->prepathlen); + + full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */ + return full_path; +} +#endif + int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, - char *mount_data, const char *devname) + char *mount_data_global, const char *devname) { int rc = 0; int xid; @@ -2225,6 +2282,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct cifsTconInfo *tcon = NULL; struct TCP_Server_Info *srvTcp = NULL; char *full_path; + char *mount_data = mount_data_global; +#ifdef CONFIG_CIFS_DFS_UPCALL + struct dfs_info3_param *referrals = NULL; + unsigned int num_referrals = 0; + int referral_walks_count = 0; +try_mount_again: +#endif + full_path = NULL; xid = GetXid(); @@ -2371,11 +2436,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } } - /* check for null share name ie connect to dfs root */ if ((strchr(volume_info->UNC + 3, '\\') == NULL) && (strchr(volume_info->UNC + 3, '/') == NULL)) { - /* rc = connect_to_dfs_path(...) */ - cFYI(1, ("DFS root not supported")); + cERROR(1, ("Missing share name")); rc = -ENODEV; goto mount_fail_check; } else { @@ -2392,7 +2455,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } } if (rc) - goto mount_fail_check; + goto remote_path_check; tcon->seal = volume_info->seal; write_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &pSesInfo->tcon_list); @@ -2417,19 +2480,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, /* BB FIXME fix time_gran to be larger for LANMAN sessions */ sb->s_time_gran = 100; -mount_fail_check: - /* on error free sesinfo and tcon struct if needed */ - if (rc) { - /* If find_unc succeeded then rc == 0 so we can not end */ - /* up accidently freeing someone elses tcon struct */ - if (tcon) - cifs_put_tcon(tcon); - else if (pSesInfo) - cifs_put_smb_ses(pSesInfo); - else - cifs_put_tcp_session(srvTcp); - goto out; - } + if (rc) + goto remote_path_check; + cifs_sb->tcon = tcon; /* do not care if following two calls succeed - informational */ @@ -2461,7 +2514,9 @@ mount_fail_check: cifs_sb->rsize = min(cifs_sb->rsize, (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); - if (!rc && cifs_sb->prepathlen) { +remote_path_check: + /* check if a whole path (including prepath) is not remote */ + if (!rc && cifs_sb->prepathlen && tcon) { /* build_path_to_root works only when we have a valid tcon */ full_path = cifs_build_path_to_root(cifs_sb); if (full_path == NULL) { @@ -2469,1079 +2524,91 @@ mount_fail_check: goto mount_fail_check; } rc = is_path_accessible(xid, tcon, cifs_sb, full_path); - if (rc) { - cERROR(1, ("Path %s in not accessible: %d", - full_path, rc)); + if (rc != -EREMOTE) { kfree(full_path); goto mount_fail_check; } kfree(full_path); } - /* volume_info->password is freed above when existing session found - (in which case it is not needed anymore) but when new sesion is created - the password ptr is put in the new session structure (in which case the - password will be freed at unmount time) */ -out: - /* zero out password before freeing */ - if (volume_info) { - if (volume_info->password != 
NULL) { - memset(volume_info->password, 0, - strlen(volume_info->password)); - kfree(volume_info->password); - } - kfree(volume_info->UNC); - kfree(volume_info->prepath); - kfree(volume_info); - } - FreeXid(xid); - return rc; -} - -static int -CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, - char session_key[CIFS_SESS_KEY_SIZE], - const struct nls_table *nls_codepage) -{ - struct smb_hdr *smb_buffer; - struct smb_hdr *smb_buffer_response; - SESSION_SETUP_ANDX *pSMB; - SESSION_SETUP_ANDX *pSMBr; - char *bcc_ptr; - char *user; - char *domain; - int rc = 0; - int remaining_words = 0; - int bytes_returned = 0; - int len; - __u32 capabilities; - __u16 count; - - cFYI(1, ("In sesssetup")); - if (ses == NULL) - return -EINVAL; - user = ses->userName; - domain = ses->domainName; - smb_buffer = cifs_buf_get(); - - if (smb_buffer == NULL) - return -ENOMEM; - - smb_buffer_response = smb_buffer; - pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer; - - /* send SMBsessionSetup here */ - header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX, - NULL /* no tCon exists yet */ , 13 /* wct */ ); - - smb_buffer->Mid = GetNextMid(ses->server); - pSMB->req_no_secext.AndXCommand = 0xFF; - pSMB->req_no_secext.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); - pSMB->req_no_secext.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - - if (ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - - capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | - CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; - if (ses->capabilities & CAP_UNICODE) { - smb_buffer->Flags2 |= SMBFLG2_UNICODE; - capabilities |= CAP_UNICODE; - } - if (ses->capabilities & CAP_STATUS32) { - smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; - capabilities |= CAP_STATUS32; - } - if (ses->capabilities & CAP_DFS) { - smb_buffer->Flags2 |= SMBFLG2_DFS; - capabilities |= CAP_DFS; - } - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - - pSMB->req_no_secext.CaseInsensitivePasswordLength = - cpu_to_le16(CIFS_SESS_KEY_SIZE); - - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(CIFS_SESS_KEY_SIZE); - bcc_ptr = pByteArea(smb_buffer); - memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE); - bcc_ptr += CIFS_SESS_KEY_SIZE; - memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE); - bcc_ptr += CIFS_SESS_KEY_SIZE; - - if (ses->capabilities & CAP_UNICODE) { - if ((long) bcc_ptr % 2) { /* must be word aligned for Unicode */ - *bcc_ptr = 0; - bcc_ptr++; - } - if (user == NULL) - bytes_returned = 0; /* skip null user */ - else - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, user, 100, - nls_codepage); - /* convert number of 16 bit words to bytes */ - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; /* trailing null */ - if (domain == NULL) - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, - "CIFS_LINUX_DOM", 32, nls_codepage); - else - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64, - nls_codepage); - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ", - 32, nls_codepage); - bcc_ptr += 2 * bytes_returned; - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, - 32, nls_codepage); - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, - 64, nls_codepage); - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; - } else { - if (user != NULL) { - strncpy(bcc_ptr, user, 200); - bcc_ptr += 
strnlen(user, 200); - } - *bcc_ptr = 0; - bcc_ptr++; - if (domain == NULL) { - strcpy(bcc_ptr, "CIFS_LINUX_DOM"); - bcc_ptr += strlen("CIFS_LINUX_DOM") + 1; - } else { - strncpy(bcc_ptr, domain, 64); - bcc_ptr += strnlen(domain, 64); - *bcc_ptr = 0; - bcc_ptr++; - } - strcpy(bcc_ptr, "Linux version "); - bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, utsname()->release); - bcc_ptr += strlen(utsname()->release) + 1; - strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); - bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; - } - count = (long) bcc_ptr - (long) pByteArea(smb_buffer); - smb_buffer->smb_buf_length += count; - pSMB->req_no_secext.ByteCount = cpu_to_le16(count); - - rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, - &bytes_returned, CIFS_LONG_OP); - if (rc) { -/* rc = map_smb_to_linux_error(smb_buffer_response); now done in SendReceive */ - } else if ((smb_buffer_response->WordCount == 3) - || (smb_buffer_response->WordCount == 4)) { - __u16 action = le16_to_cpu(pSMBr->resp.Action); - __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength); - if (action & GUEST_LOGIN) - cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */ - ses->Suid = smb_buffer_response->Uid; /* UID left in wire format - (little endian) */ - cFYI(1, ("UID = %d ", ses->Suid)); - /* response can have either 3 or 4 word count - Samba sends 3 */ - bcc_ptr = pByteArea(smb_buffer_response); - if ((pSMBr->resp.hdr.WordCount == 3) - || ((pSMBr->resp.hdr.WordCount == 4) - && (blob_len < pSMBr->resp.ByteCount))) { - if (pSMBr->resp.hdr.WordCount == 4) - bcc_ptr += blob_len; - - if (smb_buffer->Flags2 & SMBFLG2_UNICODE) { - if ((long) (bcc_ptr) % 2) { - remaining_words = - (BCC(smb_buffer_response) - 1) / 2; - /* Unicode strings must be word - aligned */ - bcc_ptr++; - } else { - remaining_words = - BCC(smb_buffer_response) / 2; - } - len = - UniStrnlen((wchar_t *) bcc_ptr, - remaining_words - 1); -/* We look for obvious messed up bcc or strings in response so we do not go off - the end since (at least) WIN2K and Windows XP have a major bug in not null - terminating last Unicode string in response */ - if (ses->serverOS) - kfree(ses->serverOS); - ses->serverOS = kzalloc(2 * (len + 1), - GFP_KERNEL); - if (ses->serverOS == NULL) - goto sesssetup_nomem; - cifs_strfromUCS_le(ses->serverOS, - (__le16 *)bcc_ptr, - len, nls_codepage); - bcc_ptr += 2 * (len + 1); - remaining_words -= len + 1; - ses->serverOS[2 * len] = 0; - ses->serverOS[1 + (2 * len)] = 0; - if (remaining_words > 0) { - len = UniStrnlen((wchar_t *)bcc_ptr, - remaining_words-1); - kfree(ses->serverNOS); - ses->serverNOS = kzalloc(2 * (len + 1), - GFP_KERNEL); - if (ses->serverNOS == NULL) - goto sesssetup_nomem; - cifs_strfromUCS_le(ses->serverNOS, - (__le16 *)bcc_ptr, - len, nls_codepage); - bcc_ptr += 2 * (len + 1); - ses->serverNOS[2 * len] = 0; - ses->serverNOS[1 + (2 * len)] = 0; - if (strncmp(ses->serverNOS, - "NT LAN Manager 4", 16) == 0) { - cFYI(1, ("NT4 server")); - ses->flags |= CIFS_SES_NT4; - } - remaining_words -= len + 1; - if (remaining_words > 0) { - len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); - /* last string is not always null terminated - (for e.g. 
for Windows XP & 2000) */ - if (ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(2*(len+1), - GFP_KERNEL); - if (ses->serverDomain == NULL) - goto sesssetup_nomem; - cifs_strfromUCS_le(ses->serverDomain, - (__le16 *)bcc_ptr, - len, nls_codepage); - bcc_ptr += 2 * (len + 1); - ses->serverDomain[2*len] = 0; - ses->serverDomain[1+(2*len)] = 0; - } else { /* else no more room so create - dummy domain string */ - if (ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(2, GFP_KERNEL); - } - } else { /* no room so create dummy domain - and NOS string */ - - /* if these kcallocs fail not much we - can do, but better to not fail the - sesssetup itself */ - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(2, GFP_KERNEL); - kfree(ses->serverNOS); - ses->serverNOS = - kzalloc(2, GFP_KERNEL); - } - } else { /* ASCII */ - len = strnlen(bcc_ptr, 1024); - if (((long) bcc_ptr + len) - (long) - pByteArea(smb_buffer_response) - <= BCC(smb_buffer_response)) { - kfree(ses->serverOS); - ses->serverOS = kzalloc(len + 1, - GFP_KERNEL); - if (ses->serverOS == NULL) - goto sesssetup_nomem; - strncpy(ses->serverOS, bcc_ptr, len); - - bcc_ptr += len; - /* null terminate the string */ - bcc_ptr[0] = 0; - bcc_ptr++; - - len = strnlen(bcc_ptr, 1024); - kfree(ses->serverNOS); - ses->serverNOS = kzalloc(len + 1, - GFP_KERNEL); - if (ses->serverNOS == NULL) - goto sesssetup_nomem; - strncpy(ses->serverNOS, bcc_ptr, len); - bcc_ptr += len; - bcc_ptr[0] = 0; - bcc_ptr++; - - len = strnlen(bcc_ptr, 1024); - if (ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = kzalloc(len + 1, - GFP_KERNEL); - if (ses->serverDomain == NULL) - goto sesssetup_nomem; - strncpy(ses->serverDomain, bcc_ptr, - len); - bcc_ptr += len; - bcc_ptr[0] = 0; - bcc_ptr++; - } else - cFYI(1, - ("Variable field of length %d " - "extends beyond end of smb ", - len)); - } - } else { - cERROR(1, ("Security Blob Length extends beyond " - "end of SMB")); + /* get referral if needed */ + if (rc == -EREMOTE) { +#ifdef CONFIG_CIFS_DFS_UPCALL + if (referral_walks_count > MAX_NESTED_LINKS) { + /* + * BB: when we implement proper loop detection, + * we will remove this check. But now we need it + * to prevent an indefinite loop if 'DFS tree' is + * misconfigured (i.e. has loops). 
+ */ + rc = -ELOOP; + goto mount_fail_check; } - } else { - cERROR(1, ("Invalid Word count %d: ", - smb_buffer_response->WordCount)); - rc = -EIO; - } -sesssetup_nomem: /* do not return an error on nomem for the info strings, - since that could make reconnection harder, and - reconnection might be needed to free memory */ - cifs_buf_release(smb_buffer); - - return rc; -} - -static int -CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, - struct cifsSesInfo *ses, bool *pNTLMv2_flag, - const struct nls_table *nls_codepage) -{ - struct smb_hdr *smb_buffer; - struct smb_hdr *smb_buffer_response; - SESSION_SETUP_ANDX *pSMB; - SESSION_SETUP_ANDX *pSMBr; - char *bcc_ptr; - char *domain; - int rc = 0; - int remaining_words = 0; - int bytes_returned = 0; - int len; - int SecurityBlobLength = sizeof(NEGOTIATE_MESSAGE); - PNEGOTIATE_MESSAGE SecurityBlob; - PCHALLENGE_MESSAGE SecurityBlob2; - __u32 negotiate_flags, capabilities; - __u16 count; - - cFYI(1, ("In NTLMSSP sesssetup (negotiate)")); - if (ses == NULL) - return -EINVAL; - domain = ses->domainName; - *pNTLMv2_flag = false; - smb_buffer = cifs_buf_get(); - if (smb_buffer == NULL) { - return -ENOMEM; - } - smb_buffer_response = smb_buffer; - pSMB = (SESSION_SETUP_ANDX *) smb_buffer; - pSMBr = (SESSION_SETUP_ANDX *) smb_buffer_response; - - /* send SMBsessionSetup here */ - header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX, - NULL /* no tCon exists yet */ , 12 /* wct */ ); - - smb_buffer->Mid = GetNextMid(ses->server); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - pSMB->req.hdr.Flags |= (SMBFLG_CASELESS | SMBFLG_CANONICAL_PATH_FORMAT); - - pSMB->req.AndXCommand = 0xFF; - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); - pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - - if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - - capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | - CAP_EXTENDED_SECURITY; - if (ses->capabilities & CAP_UNICODE) { - smb_buffer->Flags2 |= SMBFLG2_UNICODE; - capabilities |= CAP_UNICODE; - } - if (ses->capabilities & CAP_STATUS32) { - smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; - capabilities |= CAP_STATUS32; - } - if (ses->capabilities & CAP_DFS) { - smb_buffer->Flags2 |= SMBFLG2_DFS; - capabilities |= CAP_DFS; - } - pSMB->req.Capabilities = cpu_to_le32(capabilities); - - bcc_ptr = (char *) &pSMB->req.SecurityBlob; - SecurityBlob = (PNEGOTIATE_MESSAGE) bcc_ptr; - strncpy(SecurityBlob->Signature, NTLMSSP_SIGNATURE, 8); - SecurityBlob->MessageType = NtLmNegotiate; - negotiate_flags = - NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_OEM | - NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_NTLM | - NTLMSSP_NEGOTIATE_56 | - /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN | */ NTLMSSP_NEGOTIATE_128; - if (sign_CIFS_PDUs) - negotiate_flags |= NTLMSSP_NEGOTIATE_SIGN; -/* if (ntlmv2_support) - negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;*/ - /* setup pointers to domain name and workstation name */ - bcc_ptr += SecurityBlobLength; - - SecurityBlob->WorkstationName.Buffer = 0; - SecurityBlob->WorkstationName.Length = 0; - SecurityBlob->WorkstationName.MaximumLength = 0; - - /* Domain not sent on first Sesssetup in NTLMSSP, instead it is sent - along with username on auth request (ie the response to challenge) */ - SecurityBlob->DomainName.Buffer = 0; - SecurityBlob->DomainName.Length = 0; - SecurityBlob->DomainName.MaximumLength = 0; - if (ses->capabilities & CAP_UNICODE) { - if ((long) bcc_ptr % 2) { - *bcc_ptr = 0; - bcc_ptr++; + /* 
convert forward to back slashes in prepath here if needed */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0) + convert_delimiter(cifs_sb->prepath, + CIFS_DIR_SEP(cifs_sb)); + full_path = build_unc_path_to_root(volume_info, cifs_sb); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto mount_fail_check; } - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ", - 32, nls_codepage); - bcc_ptr += 2 * bytes_returned; - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, - nls_codepage); - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; /* null terminate Linux version */ - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, - 64, nls_codepage); - bcc_ptr += 2 * bytes_returned; - *(bcc_ptr + 1) = 0; - *(bcc_ptr + 2) = 0; - bcc_ptr += 2; /* null terminate network opsys string */ - *(bcc_ptr + 1) = 0; - *(bcc_ptr + 2) = 0; - bcc_ptr += 2; /* null domain */ - } else { /* ASCII */ - strcpy(bcc_ptr, "Linux version "); - bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, utsname()->release); - bcc_ptr += strlen(utsname()->release) + 1; - strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); - bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; - bcc_ptr++; /* empty domain field */ - *bcc_ptr = 0; - } - SecurityBlob->NegotiateFlags = cpu_to_le32(negotiate_flags); - pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength); - count = (long) bcc_ptr - (long) pByteArea(smb_buffer); - smb_buffer->smb_buf_length += count; - pSMB->req.ByteCount = cpu_to_le16(count); - - rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, - &bytes_returned, CIFS_LONG_OP); - - if (smb_buffer_response->Status.CifsError == - cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)) - rc = 0; - - if (rc) { -/* rc = map_smb_to_linux_error(smb_buffer_response); *//* done in SendReceive now */ - } else if ((smb_buffer_response->WordCount == 3) - || (smb_buffer_response->WordCount == 4)) { - __u16 action = le16_to_cpu(pSMBr->resp.Action); - __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength); - - if (action & GUEST_LOGIN) - cFYI(1, ("Guest login")); - /* Do we want to set anything in SesInfo struct when guest login? 
*/ - - bcc_ptr = pByteArea(smb_buffer_response); - /* response can have either 3 or 4 word count - Samba sends 3 */ - - SecurityBlob2 = (PCHALLENGE_MESSAGE) bcc_ptr; - if (SecurityBlob2->MessageType != NtLmChallenge) { - cFYI(1, ("Unexpected NTLMSSP message type received %d", - SecurityBlob2->MessageType)); - } else if (ses) { - ses->Suid = smb_buffer_response->Uid; /* UID left in le format */ - cFYI(1, ("UID = %d", ses->Suid)); - if ((pSMBr->resp.hdr.WordCount == 3) - || ((pSMBr->resp.hdr.WordCount == 4) - && (blob_len < - pSMBr->resp.ByteCount))) { - - if (pSMBr->resp.hdr.WordCount == 4) { - bcc_ptr += blob_len; - cFYI(1, ("Security Blob Length %d", - blob_len)); - } - - cFYI(1, ("NTLMSSP Challenge rcvd")); - - memcpy(ses->server->cryptKey, - SecurityBlob2->Challenge, - CIFS_CRYPTO_KEY_SIZE); - if (SecurityBlob2->NegotiateFlags & - cpu_to_le32(NTLMSSP_NEGOTIATE_NTLMV2)) - *pNTLMv2_flag = true; - - if ((SecurityBlob2->NegotiateFlags & - cpu_to_le32(NTLMSSP_NEGOTIATE_ALWAYS_SIGN)) - || (sign_CIFS_PDUs > 1)) - ses->server->secMode |= - SECMODE_SIGN_REQUIRED; - if ((SecurityBlob2->NegotiateFlags & - cpu_to_le32(NTLMSSP_NEGOTIATE_SIGN)) && (sign_CIFS_PDUs)) - ses->server->secMode |= - SECMODE_SIGN_ENABLED; - - if (smb_buffer->Flags2 & SMBFLG2_UNICODE) { - if ((long) (bcc_ptr) % 2) { - remaining_words = - (BCC(smb_buffer_response) - - 1) / 2; - /* Must word align unicode strings */ - bcc_ptr++; - } else { - remaining_words = - BCC - (smb_buffer_response) / 2; - } - len = - UniStrnlen((wchar_t *) bcc_ptr, - remaining_words - 1); -/* We look for obvious messed up bcc or strings in response so we do not go off - the end since (at least) WIN2K and Windows XP have a major bug in not null - terminating last Unicode string in response */ - if (ses->serverOS) - kfree(ses->serverOS); - ses->serverOS = - kzalloc(2 * (len + 1), GFP_KERNEL); - cifs_strfromUCS_le(ses->serverOS, - (__le16 *) - bcc_ptr, len, - nls_codepage); - bcc_ptr += 2 * (len + 1); - remaining_words -= len + 1; - ses->serverOS[2 * len] = 0; - ses->serverOS[1 + (2 * len)] = 0; - if (remaining_words > 0) { - len = UniStrnlen((wchar_t *) - bcc_ptr, - remaining_words - - 1); - kfree(ses->serverNOS); - ses->serverNOS = - kzalloc(2 * (len + 1), - GFP_KERNEL); - cifs_strfromUCS_le(ses-> - serverNOS, - (__le16 *) - bcc_ptr, - len, - nls_codepage); - bcc_ptr += 2 * (len + 1); - ses->serverNOS[2 * len] = 0; - ses->serverNOS[1 + - (2 * len)] = 0; - remaining_words -= len + 1; - if (remaining_words > 0) { - len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); - /* last string not always null terminated - (for e.g. 
for Windows XP & 2000) */ - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(2 * - (len + - 1), - GFP_KERNEL); - cifs_strfromUCS_le - (ses->serverDomain, - (__le16 *)bcc_ptr, - len, nls_codepage); - bcc_ptr += - 2 * (len + 1); - ses->serverDomain[2*len] - = 0; - ses->serverDomain - [1 + (2 * len)] - = 0; - } /* else no more room so create dummy domain string */ - else { - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(2, - GFP_KERNEL); - } - } else { /* no room so create dummy domain and NOS string */ - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(2, GFP_KERNEL); - kfree(ses->serverNOS); - ses->serverNOS = - kzalloc(2, GFP_KERNEL); - } - } else { /* ASCII */ - len = strnlen(bcc_ptr, 1024); - if (((long) bcc_ptr + len) - (long) - pByteArea(smb_buffer_response) - <= BCC(smb_buffer_response)) { - if (ses->serverOS) - kfree(ses->serverOS); - ses->serverOS = - kzalloc(len + 1, - GFP_KERNEL); - strncpy(ses->serverOS, - bcc_ptr, len); - - bcc_ptr += len; - bcc_ptr[0] = 0; /* null terminate string */ - bcc_ptr++; - - len = strnlen(bcc_ptr, 1024); - kfree(ses->serverNOS); - ses->serverNOS = - kzalloc(len + 1, - GFP_KERNEL); - strncpy(ses->serverNOS, bcc_ptr, len); - bcc_ptr += len; - bcc_ptr[0] = 0; - bcc_ptr++; - - len = strnlen(bcc_ptr, 1024); - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(len + 1, - GFP_KERNEL); - strncpy(ses->serverDomain, - bcc_ptr, len); - bcc_ptr += len; - bcc_ptr[0] = 0; - bcc_ptr++; - } else - cFYI(1, - ("field of length %d " - "extends beyond end of smb", - len)); - } - } else { - cERROR(1, ("Security Blob Length extends beyond" - " end of SMB")); - } - } else { - cERROR(1, ("No session structure passed in.")); + cFYI(1, ("Getting referral for: %s", full_path)); + rc = get_dfs_path(xid, pSesInfo , full_path + 1, + cifs_sb->local_nls, &num_referrals, &referrals, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (!rc && num_referrals > 0) { + char *fake_devname = NULL; + + if (mount_data != mount_data_global) + kfree(mount_data); + mount_data = cifs_compose_mount_options( + cifs_sb->mountdata, full_path + 1, + referrals, &fake_devname); + kfree(fake_devname); + free_dfs_info_array(referrals, num_referrals); + + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + + cleanup_volume_info(&volume_info); + FreeXid(xid); + kfree(full_path); + referral_walks_count++; + goto try_mount_again; } - } else { - cERROR(1, ("Invalid Word count %d:", - smb_buffer_response->WordCount)); - rc = -EIO; - } - - cifs_buf_release(smb_buffer); - - return rc; -} -static int -CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, - char *ntlm_session_key, bool ntlmv2_flag, - const struct nls_table *nls_codepage) -{ - struct smb_hdr *smb_buffer; - struct smb_hdr *smb_buffer_response; - SESSION_SETUP_ANDX *pSMB; - SESSION_SETUP_ANDX *pSMBr; - char *bcc_ptr; - char *user; - char *domain; - int rc = 0; - int remaining_words = 0; - int bytes_returned = 0; - int len; - int SecurityBlobLength = sizeof(AUTHENTICATE_MESSAGE); - PAUTHENTICATE_MESSAGE SecurityBlob; - __u32 negotiate_flags, capabilities; - __u16 count; - - cFYI(1, ("In NTLMSSPSessSetup (Authenticate)")); - if (ses == NULL) - return -EINVAL; - user = ses->userName; - domain = ses->domainName; - smb_buffer = cifs_buf_get(); - if (smb_buffer == NULL) { - return -ENOMEM; - } - smb_buffer_response = smb_buffer; - pSMB = (SESSION_SETUP_ANDX *)smb_buffer; - pSMBr = (SESSION_SETUP_ANDX *)smb_buffer_response; - - /* send SMBsessionSetup here */ - 
header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX, - NULL /* no tCon exists yet */ , 12 /* wct */ ); - - smb_buffer->Mid = GetNextMid(ses->server); - pSMB->req.hdr.Flags |= (SMBFLG_CASELESS | SMBFLG_CANONICAL_PATH_FORMAT); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - pSMB->req.AndXCommand = 0xFF; - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); - pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - - pSMB->req.hdr.Uid = ses->Suid; - - if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - - capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | - CAP_EXTENDED_SECURITY; - if (ses->capabilities & CAP_UNICODE) { - smb_buffer->Flags2 |= SMBFLG2_UNICODE; - capabilities |= CAP_UNICODE; - } - if (ses->capabilities & CAP_STATUS32) { - smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; - capabilities |= CAP_STATUS32; +#else /* No DFS support, return error on mount */ + rc = -EOPNOTSUPP; +#endif } - if (ses->capabilities & CAP_DFS) { - smb_buffer->Flags2 |= SMBFLG2_DFS; - capabilities |= CAP_DFS; - } - pSMB->req.Capabilities = cpu_to_le32(capabilities); - - bcc_ptr = (char *)&pSMB->req.SecurityBlob; - SecurityBlob = (PAUTHENTICATE_MESSAGE)bcc_ptr; - strncpy(SecurityBlob->Signature, NTLMSSP_SIGNATURE, 8); - SecurityBlob->MessageType = NtLmAuthenticate; - bcc_ptr += SecurityBlobLength; - negotiate_flags = NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_REQUEST_TARGET | - NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_TARGET_INFO | - 0x80000000 | NTLMSSP_NEGOTIATE_128; - if (sign_CIFS_PDUs) - negotiate_flags |= /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN |*/ NTLMSSP_NEGOTIATE_SIGN; - if (ntlmv2_flag) - negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2; - -/* setup pointers to domain name and workstation name */ - - SecurityBlob->WorkstationName.Buffer = 0; - SecurityBlob->WorkstationName.Length = 0; - SecurityBlob->WorkstationName.MaximumLength = 0; - SecurityBlob->SessionKey.Length = 0; - SecurityBlob->SessionKey.MaximumLength = 0; - SecurityBlob->SessionKey.Buffer = 0; - - SecurityBlob->LmChallengeResponse.Length = 0; - SecurityBlob->LmChallengeResponse.MaximumLength = 0; - SecurityBlob->LmChallengeResponse.Buffer = 0; - - SecurityBlob->NtChallengeResponse.Length = - cpu_to_le16(CIFS_SESS_KEY_SIZE); - SecurityBlob->NtChallengeResponse.MaximumLength = - cpu_to_le16(CIFS_SESS_KEY_SIZE); - memcpy(bcc_ptr, ntlm_session_key, CIFS_SESS_KEY_SIZE); - SecurityBlob->NtChallengeResponse.Buffer = - cpu_to_le32(SecurityBlobLength); - SecurityBlobLength += CIFS_SESS_KEY_SIZE; - bcc_ptr += CIFS_SESS_KEY_SIZE; - if (ses->capabilities & CAP_UNICODE) { - if (domain == NULL) { - SecurityBlob->DomainName.Buffer = 0; - SecurityBlob->DomainName.Length = 0; - SecurityBlob->DomainName.MaximumLength = 0; - } else { - __u16 ln = cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64, - nls_codepage); - ln *= 2; - SecurityBlob->DomainName.MaximumLength = - cpu_to_le16(ln); - SecurityBlob->DomainName.Buffer = - cpu_to_le32(SecurityBlobLength); - bcc_ptr += ln; - SecurityBlobLength += ln; - SecurityBlob->DomainName.Length = cpu_to_le16(ln); - } - if (user == NULL) { - SecurityBlob->UserName.Buffer = 0; - SecurityBlob->UserName.Length = 0; - SecurityBlob->UserName.MaximumLength = 0; - } else { - __u16 ln = cifs_strtoUCS((__le16 *) bcc_ptr, user, 64, - nls_codepage); - ln *= 2; - SecurityBlob->UserName.MaximumLength = - cpu_to_le16(ln); - SecurityBlob->UserName.Buffer = - cpu_to_le32(SecurityBlobLength); - bcc_ptr += ln; - SecurityBlobLength += ln; - 
SecurityBlob->UserName.Length = cpu_to_le16(ln); - } - - /* SecurityBlob->WorkstationName.Length = - cifs_strtoUCS((__le16 *) bcc_ptr, "AMACHINE",64, nls_codepage); - SecurityBlob->WorkstationName.Length *= 2; - SecurityBlob->WorkstationName.MaximumLength = - cpu_to_le16(SecurityBlob->WorkstationName.Length); - SecurityBlob->WorkstationName.Buffer = - cpu_to_le32(SecurityBlobLength); - bcc_ptr += SecurityBlob->WorkstationName.Length; - SecurityBlobLength += SecurityBlob->WorkstationName.Length; - SecurityBlob->WorkstationName.Length = - cpu_to_le16(SecurityBlob->WorkstationName.Length); */ - - if ((long) bcc_ptr % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ", - 32, nls_codepage); - bcc_ptr += 2 * bytes_returned; - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, - nls_codepage); - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; /* null term version string */ - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, - 64, nls_codepage); - bcc_ptr += 2 * bytes_returned; - *(bcc_ptr + 1) = 0; - *(bcc_ptr + 2) = 0; - bcc_ptr += 2; /* null terminate network opsys string */ - *(bcc_ptr + 1) = 0; - *(bcc_ptr + 2) = 0; - bcc_ptr += 2; /* null domain */ - } else { /* ASCII */ - if (domain == NULL) { - SecurityBlob->DomainName.Buffer = 0; - SecurityBlob->DomainName.Length = 0; - SecurityBlob->DomainName.MaximumLength = 0; - } else { - __u16 ln; - negotiate_flags |= NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED; - strncpy(bcc_ptr, domain, 63); - ln = strnlen(domain, 64); - SecurityBlob->DomainName.MaximumLength = - cpu_to_le16(ln); - SecurityBlob->DomainName.Buffer = - cpu_to_le32(SecurityBlobLength); - bcc_ptr += ln; - SecurityBlobLength += ln; - SecurityBlob->DomainName.Length = cpu_to_le16(ln); - } - if (user == NULL) { - SecurityBlob->UserName.Buffer = 0; - SecurityBlob->UserName.Length = 0; - SecurityBlob->UserName.MaximumLength = 0; - } else { - __u16 ln; - strncpy(bcc_ptr, user, 63); - ln = strnlen(user, 64); - SecurityBlob->UserName.MaximumLength = cpu_to_le16(ln); - SecurityBlob->UserName.Buffer = - cpu_to_le32(SecurityBlobLength); - bcc_ptr += ln; - SecurityBlobLength += ln; - SecurityBlob->UserName.Length = cpu_to_le16(ln); - } - /* BB fill in our workstation name if known BB */ - - strcpy(bcc_ptr, "Linux version "); - bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, utsname()->release); - bcc_ptr += strlen(utsname()->release) + 1; - strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); - bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; - bcc_ptr++; /* null domain */ - *bcc_ptr = 0; - } - SecurityBlob->NegotiateFlags = cpu_to_le32(negotiate_flags); - pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength); - count = (long) bcc_ptr - (long) pByteArea(smb_buffer); - smb_buffer->smb_buf_length += count; - pSMB->req.ByteCount = cpu_to_le16(count); - - rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, - &bytes_returned, CIFS_LONG_OP); +mount_fail_check: + /* on error free sesinfo and tcon struct if needed */ if (rc) { -/* rc = map_smb_to_linux_error(smb_buffer_response) done in SendReceive now */ - } else if ((smb_buffer_response->WordCount == 3) || - (smb_buffer_response->WordCount == 4)) { - __u16 action = le16_to_cpu(pSMBr->resp.Action); - __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength); - if (action & GUEST_LOGIN) - cFYI(1, ("Guest login")); /* BB Should we set anything - in SesInfo struct ? */ -/* if (SecurityBlob2->MessageType != NtLm??) 
{ - cFYI("Unexpected message type on auth response is %d")); - } */ - - if (ses) { - cFYI(1, - ("Check challenge UID %d vs auth response UID %d", - ses->Suid, smb_buffer_response->Uid)); - /* UID left in wire format */ - ses->Suid = smb_buffer_response->Uid; - bcc_ptr = pByteArea(smb_buffer_response); - /* response can have either 3 or 4 word count - Samba sends 3 */ - if ((pSMBr->resp.hdr.WordCount == 3) - || ((pSMBr->resp.hdr.WordCount == 4) - && (blob_len < - pSMBr->resp.ByteCount))) { - if (pSMBr->resp.hdr.WordCount == 4) { - bcc_ptr += - blob_len; - cFYI(1, - ("Security Blob Length %d ", - blob_len)); - } - - cFYI(1, - ("NTLMSSP response to Authenticate ")); - - if (smb_buffer->Flags2 & SMBFLG2_UNICODE) { - if ((long) (bcc_ptr) % 2) { - remaining_words = - (BCC(smb_buffer_response) - - 1) / 2; - bcc_ptr++; /* Unicode strings must be word aligned */ - } else { - remaining_words = BCC(smb_buffer_response) / 2; - } - len = UniStrnlen((wchar_t *) bcc_ptr, - remaining_words - 1); -/* We look for obvious messed up bcc or strings in response so we do not go off - the end since (at least) WIN2K and Windows XP have a major bug in not null - terminating last Unicode string in response */ - if (ses->serverOS) - kfree(ses->serverOS); - ses->serverOS = - kzalloc(2 * (len + 1), GFP_KERNEL); - cifs_strfromUCS_le(ses->serverOS, - (__le16 *) - bcc_ptr, len, - nls_codepage); - bcc_ptr += 2 * (len + 1); - remaining_words -= len + 1; - ses->serverOS[2 * len] = 0; - ses->serverOS[1 + (2 * len)] = 0; - if (remaining_words > 0) { - len = UniStrnlen((wchar_t *) - bcc_ptr, - remaining_words - - 1); - kfree(ses->serverNOS); - ses->serverNOS = - kzalloc(2 * (len + 1), - GFP_KERNEL); - cifs_strfromUCS_le(ses-> - serverNOS, - (__le16 *) - bcc_ptr, - len, - nls_codepage); - bcc_ptr += 2 * (len + 1); - ses->serverNOS[2 * len] = 0; - ses->serverNOS[1+(2*len)] = 0; - remaining_words -= len + 1; - if (remaining_words > 0) { - len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); - /* last string not always null terminated (e.g. 
for Windows XP & 2000) */ - if (ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(2 * - (len + - 1), - GFP_KERNEL); - cifs_strfromUCS_le - (ses-> - serverDomain, - (__le16 *) - bcc_ptr, len, - nls_codepage); - bcc_ptr += - 2 * (len + 1); - ses-> - serverDomain[2 - * len] - = 0; - ses-> - serverDomain[1 - + - (2 - * - len)] - = 0; - } /* else no more room so create dummy domain string */ - else { - if (ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = kzalloc(2,GFP_KERNEL); - } - } else { /* no room so create dummy domain and NOS string */ - if (ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = kzalloc(2, GFP_KERNEL); - kfree(ses->serverNOS); - ses->serverNOS = kzalloc(2, GFP_KERNEL); - } - } else { /* ASCII */ - len = strnlen(bcc_ptr, 1024); - if (((long) bcc_ptr + len) - - (long) pByteArea(smb_buffer_response) - <= BCC(smb_buffer_response)) { - if (ses->serverOS) - kfree(ses->serverOS); - ses->serverOS = kzalloc(len + 1, GFP_KERNEL); - strncpy(ses->serverOS,bcc_ptr, len); - - bcc_ptr += len; - bcc_ptr[0] = 0; /* null terminate the string */ - bcc_ptr++; - - len = strnlen(bcc_ptr, 1024); - kfree(ses->serverNOS); - ses->serverNOS = kzalloc(len+1, - GFP_KERNEL); - strncpy(ses->serverNOS, - bcc_ptr, len); - bcc_ptr += len; - bcc_ptr[0] = 0; - bcc_ptr++; - - len = strnlen(bcc_ptr, 1024); - if (ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(len+1, - GFP_KERNEL); - strncpy(ses->serverDomain, - bcc_ptr, len); - bcc_ptr += len; - bcc_ptr[0] = 0; - bcc_ptr++; - } else - cFYI(1, ("field of length %d " - "extends beyond end of smb ", - len)); - } - } else { - cERROR(1, ("Security Blob extends beyond end " - "of SMB")); - } - } else { - cERROR(1, ("No session structure passed in.")); - } - } else { - cERROR(1, ("Invalid Word count %d: ", - smb_buffer_response->WordCount)); - rc = -EIO; + if (mount_data != mount_data_global) + kfree(mount_data); + /* If find_unc succeeded then rc == 0 so we can not end */ + /* up accidently freeing someone elses tcon struct */ + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + else + cifs_put_tcp_session(srvTcp); + goto out; } - cifs_buf_release(smb_buffer); - + /* volume_info->password is freed above when existing session found + (in which case it is not needed anymore) but when new sesion is created + the password ptr is put in the new session structure (in which case the + password will be freed at unmount time) */ +out: + /* zero out password before freeing */ + cleanup_volume_info(&volume_info); + FreeXid(xid); return rc; } @@ -3556,7 +2623,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, TCONX_RSP *pSMBr; unsigned char *bcc_ptr; int rc = 0; - int length; + int length, bytes_left; __u16 count; if (ses == NULL) @@ -3644,14 +2711,22 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, CIFS_STD_OP); - /* if (rc) rc = map_smb_to_linux_error(smb_buffer_response); */ /* above now done in SendReceive */ if ((rc == 0) && (tcon != NULL)) { + bool is_unicode; + tcon->tidStatus = CifsGood; tcon->need_reconnect = false; tcon->tid = smb_buffer_response->Tid; bcc_ptr = pByteArea(smb_buffer_response); - length = strnlen(bcc_ptr, BCC(smb_buffer_response) - 2); + bytes_left = BCC(smb_buffer_response); + length = strnlen(bcc_ptr, bytes_left - 2); + if (smb_buffer->Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + + /* skip service field (NB: 
this field is always ASCII) */ if (length == 3) { if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && @@ -3666,40 +2741,16 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, } } bcc_ptr += length + 1; + bytes_left -= (length + 1); strncpy(tcon->treeName, tree, MAX_TREE_SIZE); - if (smb_buffer->Flags2 & SMBFLG2_UNICODE) { - length = UniStrnlen((wchar_t *) bcc_ptr, 512); - if ((bcc_ptr + (2 * length)) - - pByteArea(smb_buffer_response) <= - BCC(smb_buffer_response)) { - kfree(tcon->nativeFileSystem); - tcon->nativeFileSystem = - kzalloc(2*(length + 1), GFP_KERNEL); - if (tcon->nativeFileSystem) - cifs_strfromUCS_le( - tcon->nativeFileSystem, - (__le16 *) bcc_ptr, - length, nls_codepage); - bcc_ptr += 2 * length; - bcc_ptr[0] = 0; /* null terminate the string */ - bcc_ptr[1] = 0; - bcc_ptr += 2; - } - /* else do not bother copying these information fields*/ - } else { - length = strnlen(bcc_ptr, 1024); - if ((bcc_ptr + length) - - pByteArea(smb_buffer_response) <= - BCC(smb_buffer_response)) { - kfree(tcon->nativeFileSystem); - tcon->nativeFileSystem = - kzalloc(length + 1, GFP_KERNEL); - if (tcon->nativeFileSystem) - strncpy(tcon->nativeFileSystem, bcc_ptr, - length); - } - /* else do not bother copying these information fields*/ - } + + /* mostly informational -- no need to fail on error here */ + tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr, + bytes_left, is_unicode, + nls_codepage); + + cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem)); + if ((smb_buffer_response->WordCount == 3) || (smb_buffer_response->WordCount == 7)) /* field is in same location */ @@ -3738,8 +2789,6 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, struct nls_table *nls_info) { int rc = 0; - char ntlm_session_key[CIFS_SESS_KEY_SIZE]; - bool ntlmv2_flag = false; int first_time = 0; struct TCP_Server_Info *server = pSesInfo->server; @@ -3771,83 +2820,19 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, pSesInfo->capabilities = server->capabilities; if (linuxExtEnabled == 0) pSesInfo->capabilities &= (~CAP_UNIX); - /* pSesInfo->sequence_number = 0;*/ + cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", server->secMode, server->capabilities, server->timeAdj)); - if (experimEnabled < 2) - rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); - else if (extended_security - && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (server->secType == NTLMSSP)) { - rc = -EOPNOTSUPP; - } else if (extended_security - && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (server->secType == RawNTLMSSP)) { - cFYI(1, ("NTLMSSP sesssetup")); - rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, - nls_info); - if (!rc) { - if (ntlmv2_flag) { - char *v2_response; - cFYI(1, ("more secure NTLM ver2 hash")); - if (CalcNTLMv2_partial_mac_key(pSesInfo, - nls_info)) { - rc = -ENOMEM; - goto ss_err_exit; - } else - v2_response = kmalloc(16 + 64 /* blob*/, - GFP_KERNEL); - if (v2_response) { - CalcNTLMv2_response(pSesInfo, - v2_response); - /* if (first_time) - cifs_calculate_ntlmv2_mac_key */ - kfree(v2_response); - /* BB Put dummy sig in SessSetup PDU? 
*/ - } else { - rc = -ENOMEM; - goto ss_err_exit; - } - - } else { - SMBNTencrypt(pSesInfo->password, - server->cryptKey, - ntlm_session_key); - - if (first_time) - cifs_calculate_mac_key( - &server->mac_signing_key, - ntlm_session_key, - pSesInfo->password); - } - /* for better security the weaker lanman hash not sent - in AuthSessSetup so we no longer calculate it */ - - rc = CIFSNTLMSSPAuthSessSetup(xid, pSesInfo, - ntlm_session_key, - ntlmv2_flag, - nls_info); - } - } else { /* old style NTLM 0.12 session setup */ - SMBNTencrypt(pSesInfo->password, server->cryptKey, - ntlm_session_key); - - if (first_time) - cifs_calculate_mac_key(&server->mac_signing_key, - ntlm_session_key, - pSesInfo->password); - - rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); - } + rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); if (rc) { cERROR(1, ("Send error in SessSetup = %d", rc)); } else { cFYI(1, ("CIFS Session Established successfully")); - spin_lock(&GlobalMid_Lock); - pSesInfo->status = CifsGood; - pSesInfo->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + spin_lock(&GlobalMid_Lock); + pSesInfo->status = CifsGood; + pSesInfo->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); } ss_err_exit: diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 54dce78fbb73..11431ed72a7f 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -129,12 +129,62 @@ cifs_bp_rename_retry: return full_path; } +static void +cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, + struct cifsTconInfo *tcon, bool write_only) +{ + int oplock = 0; + struct cifsFileInfo *pCifsFile; + struct cifsInodeInfo *pCifsInode; + + pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + + if (pCifsFile == NULL) + return; + + if (oplockEnabled) + oplock = REQ_OPLOCK; + + pCifsFile->netfid = fileHandle; + pCifsFile->pid = current->tgid; + pCifsFile->pInode = newinode; + pCifsFile->invalidHandle = false; + pCifsFile->closePend = false; + mutex_init(&pCifsFile->fh_mutex); + mutex_init(&pCifsFile->lock_mutex); + INIT_LIST_HEAD(&pCifsFile->llist); + atomic_set(&pCifsFile->wrtPending, 0); + + /* set the following in open now + pCifsFile->pfile = file; */ + write_lock(&GlobalSMBSeslock); + list_add(&pCifsFile->tlist, &tcon->openFileList); + pCifsInode = CIFS_I(newinode); + if (pCifsInode) { + /* if readable file instance put first in list*/ + if (write_only) + list_add_tail(&pCifsFile->flist, + &pCifsInode->openFileList); + else + list_add(&pCifsFile->flist, &pCifsInode->openFileList); + + if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { + pCifsInode->clientCanCacheAll = true; + pCifsInode->clientCanCacheRead = true; + cFYI(1, ("Exclusive Oplock inode %p", newinode)); + } else if ((oplock & 0xF) == OPLOCK_READ) + pCifsInode->clientCanCacheRead = true; + } + write_unlock(&GlobalSMBSeslock); +} + int cifs_posix_open(char *full_path, struct inode **pinode, struct super_block *sb, int mode, int oflags, int *poplock, __u16 *pnetfid, int xid) { int rc; __u32 oplock; + bool write_only = false; FILE_UNIX_BASIC_INFO *presp_data; __u32 posix_flags = 0; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -172,6 +222,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode, if (oflags & O_DIRECT) posix_flags |= SMB_O_DIRECT; + if (!(oflags & FMODE_READ)) + write_only = true; rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, pnetfid, presp_data, &oplock, full_path, @@ -187,8 +239,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode, if (!pinode) goto posix_open_ret; /* caller does not need info */ - if 
(*pinode == NULL) - *pinode = cifs_new_inode(sb, &presp_data->UniqueId); + if (*pinode == NULL) { + __u64 unique_id = le64_to_cpu(presp_data->UniqueId); + *pinode = cifs_new_inode(sb, &unique_id); + } /* else an inode was passed in. Update its info, don't create one */ /* We do not need to close the file if new_inode fails since @@ -198,6 +252,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_fill_in_inode(*pinode, presp_data, 1); + cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only); + posix_open_ret: kfree(presp_data); return rc; @@ -225,6 +281,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, int create_options = CREATE_NOT_DIR; int oplock = 0; int oflags; + bool posix_create = false; /* * BB below access is probably too much for mknod to request * but we have to do query and setpathinfo so requesting @@ -239,7 +296,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, char *full_path = NULL; FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; - struct cifsInodeInfo *pCifsInode; int disposition = FILE_OVERWRITE_IF; bool write_only = false; @@ -273,11 +329,13 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, negotation. EREMOTE indicates DFS junction, which is not handled in posix open */ - if ((rc == 0) && (newinode == NULL)) - goto cifs_create_get_file_info; /* query inode info */ - else if (rc == 0) /* success, no need to query */ - goto cifs_create_set_dentry; - else if ((rc != -EIO) && (rc != -EREMOTE) && + if (rc == 0) { + posix_create = true; + if (newinode == NULL) /* query inode info */ + goto cifs_create_get_file_info; + else /* success, no need to query */ + goto cifs_create_set_dentry; + } else if ((rc != -EIO) && (rc != -EREMOTE) && (rc != -EOPNOTSUPP)) /* path not found or net err */ goto cifs_create_out; /* else fallthrough to retry, using older open call, this is @@ -409,45 +467,9 @@ cifs_create_set_dentry: if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { /* mknod case - do not leave file open */ CIFSSMBClose(xid, tcon, fileHandle); - } else if (newinode) { - struct cifsFileInfo *pCifsFile = - kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - - if (pCifsFile == NULL) - goto cifs_create_out; - pCifsFile->netfid = fileHandle; - pCifsFile->pid = current->tgid; - pCifsFile->pInode = newinode; - pCifsFile->invalidHandle = false; - pCifsFile->closePend = false; - init_MUTEX(&pCifsFile->fh_sem); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); - - /* set the following in open now - pCifsFile->pfile = file; */ - write_lock(&GlobalSMBSeslock); - list_add(&pCifsFile->tlist, &tcon->openFileList); - pCifsInode = CIFS_I(newinode); - if (pCifsInode) { - /* if readable file instance put first in list*/ - if (write_only) { - list_add_tail(&pCifsFile->flist, - &pCifsInode->openFileList); - } else { - list_add(&pCifsFile->flist, - &pCifsInode->openFileList); - } - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, ("Exclusive Oplock inode %p", - newinode)); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; - } - write_unlock(&GlobalSMBSeslock); + } else if (!(posix_create) && (newinode)) { + cifs_fill_fileinfo(newinode, fileHandle, + cifs_sb->tcon, write_only); } cifs_create_out: kfree(buf); @@ -580,17 +602,21 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, return rc; } - 
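The cifs_fill_fileinfo() helper factored out above preserves a subtle ordering rule: readable instances are list_add()ed at the head of openFileList and write-only ones at the tail, so any walker hunting for a handle it can read from (for example in write_begin) can stop at the first usable entry. A sketch of such a walker (illustrative only; this exact helper is not part of the hunk):

	static struct cifsFileInfo *first_readable_file(struct cifsInodeInfo *cinode)
	{
		struct cifsFileInfo *f;

		/* head entries are the readable ones by construction */
		list_for_each_entry(f, &cinode->openFileList, flist) {
			if (!f->closePend && !f->invalidHandle)
				return f;
		}
		return NULL;
	}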
struct dentry * cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct nameidata *nd) { int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ + int oplock = 0; + int mode; + __u16 fileHandle = 0; + bool posix_open = false; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; struct inode *newInode = NULL; char *full_path = NULL; + struct file *filp; xid = GetXid(); @@ -632,12 +658,37 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); - if (pTcon->unix_ext) - rc = cifs_get_inode_info_unix(&newInode, full_path, - parent_dir_inode->i_sb, xid); - else + if (pTcon->unix_ext) { + if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && + (nd->flags & LOOKUP_OPEN)) { + if (!((nd->intent.open.flags & O_CREAT) && + (nd->intent.open.flags & O_EXCL))) { + mode = nd->intent.open.create_mode & + ~current_umask(); + rc = cifs_posix_open(full_path, &newInode, + parent_dir_inode->i_sb, mode, + nd->intent.open.flags, &oplock, + &fileHandle, xid); + /* + * This code works around a bug in + * samba posix open in samba versions 3.3.1 + * and earlier where create works + * but open fails with invalid parameter. + * If either of these error codes are + * returned, follow the normal lookup. + * Otherwise, the error during posix open + * is handled. + */ + if ((rc != -EINVAL) && (rc != -EOPNOTSUPP)) + posix_open = true; + } + } + if (!posix_open) + rc = cifs_get_inode_info_unix(&newInode, full_path, + parent_dir_inode->i_sb, xid); + } else rc = cifs_get_inode_info(&newInode, full_path, NULL, - parent_dir_inode->i_sb, xid, NULL); + parent_dir_inode->i_sb, xid, NULL); if ((rc == 0) && (newInode != NULL)) { if (pTcon->nocase) @@ -645,7 +696,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, else direntry->d_op = &cifs_dentry_ops; d_add(direntry, newInode); - + if (posix_open) + filp = lookup_instantiate_filp(nd, direntry, NULL); /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 1e0c1bd8f2e4..df4a306f697e 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -78,7 +78,7 @@ dns_resolver_instantiate(struct key *key, const void *data, } key->type_data.x[0] = datalen; - rcu_assign_pointer(key->payload.data, ip); + key->payload.data = ip; return rc; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 81747acca4c4..38c06f826575 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -46,7 +46,7 @@ static inline struct cifsFileInfo *cifs_init_private( memset(private_data, 0, sizeof(struct cifsFileInfo)); private_data->netfid = netfid; private_data->pid = current->tgid; - init_MUTEX(&private_data->fh_sem); + mutex_init(&private_data->fh_mutex); mutex_init(&private_data->lock_mutex); INIT_LIST_HEAD(&private_data->llist); private_data->pfile = file; /* needed for writepage */ @@ -129,15 +129,12 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode, struct file *file, struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); -/* struct timespec temp; */ /* BB REMOVEME BB */ file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); if (file->private_data == NULL) return -ENOMEM; pCifsFile = cifs_init_private(file->private_data, inode, file, netfid); write_lock(&GlobalSMBSeslock); - 
list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList); pCifsInode = CIFS_I(file->f_path.dentry->d_inode); if (pCifsInode == NULL) { @@ -145,17 +142,6 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode, return -EINVAL; } - /* want handles we can use to read with first - in the list so we do not have to walk the - list to search for one in write_begin */ - if ((file->f_flags & O_ACCMODE) == O_WRONLY) { - list_add_tail(&pCifsFile->flist, - &pCifsInode->openFileList); - } else { - list_add(&pCifsFile->flist, - &pCifsInode->openFileList); - } - if (pCifsInode->clientCanCacheRead) { /* we have the inode open somewhere else no need to discard cache data */ @@ -284,35 +270,32 @@ int cifs_open(struct inode *inode, struct file *file) cifs_sb = CIFS_SB(inode->i_sb); tcon = cifs_sb->tcon; - if (file->f_flags & O_CREAT) { - /* search inode for this file and fill in file->private_data */ - pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - read_lock(&GlobalSMBSeslock); - list_for_each(tmp, &pCifsInode->openFileList) { - pCifsFile = list_entry(tmp, struct cifsFileInfo, - flist); - if ((pCifsFile->pfile == NULL) && - (pCifsFile->pid == current->tgid)) { - /* mode set in cifs_create */ - - /* needed for writepage */ - pCifsFile->pfile = file; - - file->private_data = pCifsFile; - break; - } - } - read_unlock(&GlobalSMBSeslock); - if (file->private_data != NULL) { - rc = 0; - FreeXid(xid); - return rc; - } else { - if (file->f_flags & O_EXCL) - cERROR(1, ("could not find file instance for " - "new file %p", file)); + /* search inode for this file and fill in file->private_data */ + pCifsInode = CIFS_I(file->f_path.dentry->d_inode); + read_lock(&GlobalSMBSeslock); + list_for_each(tmp, &pCifsInode->openFileList) { + pCifsFile = list_entry(tmp, struct cifsFileInfo, + flist); + if ((pCifsFile->pfile == NULL) && + (pCifsFile->pid == current->tgid)) { + /* mode set in cifs_create */ + + /* needed for writepage */ + pCifsFile->pfile = file; + + file->private_data = pCifsFile; + break; } } + read_unlock(&GlobalSMBSeslock); + + if (file->private_data != NULL) { + rc = 0; + FreeXid(xid); + return rc; + } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) + cERROR(1, ("could not find file instance for " + "new file %p", file)); full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -500,9 +483,9 @@ static int cifs_reopen_file(struct file *file, bool can_flush) return -EBADF; xid = GetXid(); - down(&pCifsFile->fh_sem); + mutex_unlock(&pCifsFile->fh_mutex); if (!pCifsFile->invalidHandle) { - up(&pCifsFile->fh_sem); + mutex_lock(&pCifsFile->fh_mutex); FreeXid(xid); return 0; } @@ -533,7 +516,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush) if (full_path == NULL) { rc = -ENOMEM; reopen_error_exit: - up(&pCifsFile->fh_sem); + mutex_lock(&pCifsFile->fh_mutex); FreeXid(xid); return rc; } @@ -575,14 +558,14 @@ reopen_error_exit: cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { - up(&pCifsFile->fh_sem); + mutex_lock(&pCifsFile->fh_mutex); cFYI(1, ("cifs_open returned 0x%x", rc)); cFYI(1, ("oplock: %d", oplock)); } else { reopen_success: pCifsFile->netfid = netfid; pCifsFile->invalidHandle = false; - up(&pCifsFile->fh_sem); + mutex_lock(&pCifsFile->fh_mutex); pCifsInode = CIFS_I(inode); if (pCifsInode) { if (can_flush) { @@ -971,6 +954,40 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) return rc; } +/* + * Set the timeout on write requests past EOF. 
For some servers (Windows) + * these calls can be very long. + * + * If we're writing >10M past the EOF we give a 180s timeout. Anything less + * than that gets a 45s timeout. Writes not past EOF get 15s timeouts. + * The 10M cutoff is totally arbitrary. A better scheme for this would be + * welcome if someone wants to suggest one. + * + * We may be able to do a better job with this if there were some way to + * declare that a file should be sparse. + */ +static int +cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset) +{ + if (offset <= cifsi->server_eof) + return CIFS_STD_OP; + else if (offset > (cifsi->server_eof + (10 * 1024 * 1024))) + return CIFS_VLONG_OP; + else + return CIFS_LONG_OP; +} + +/* update the file size (if needed) after a write */ +static void +cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, + unsigned int bytes_written) +{ + loff_t end_of_write = offset + bytes_written; + + if (end_of_write > cifsi->server_eof) + cifsi->server_eof = end_of_write; +} + ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset) { @@ -981,6 +998,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1000,11 +1018,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, xid = GetXid(); - if (*poffset > file->f_path.dentry->d_inode->i_size) - long_op = CIFS_VLONG_OP; /* writes past EOF take long time */ - else - long_op = CIFS_LONG_OP; - + long_op = cifs_write_timeout(cifsi, *poffset); for (total_written = 0; write_size > total_written; total_written += bytes_written) { rc = -EAGAIN; @@ -1048,8 +1062,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, FreeXid(xid); return rc; } - } else + } else { + cifs_update_eof(cifsi, *poffset, bytes_written); *poffset += bytes_written; + } long_op = CIFS_STD_OP; /* subsequent writes fast - 15 seconds is plenty */ } @@ -1085,6 +1101,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1099,11 +1116,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, xid = GetXid(); - if (*poffset > file->f_path.dentry->d_inode->i_size) - long_op = CIFS_VLONG_OP; /* writes past EOF can be slow */ - else - long_op = CIFS_LONG_OP; - + long_op = cifs_write_timeout(cifsi, *poffset); for (total_written = 0; write_size > total_written; total_written += bytes_written) { rc = -EAGAIN; @@ -1166,8 +1179,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data, FreeXid(xid); return rc; } - } else + } else { + cifs_update_eof(cifsi, *poffset, bytes_written); *poffset += bytes_written; + } long_op = CIFS_STD_OP; /* subsequent writes fast - 15 seconds is plenty */ } @@ -1380,11 +1395,12 @@ static int cifs_writepages(struct address_space *mapping, int nr_pages; __u64 offset = 0; struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsi = CIFS_I(mapping->host); struct page *page; struct pagevec pvec; int rc = 0; int scanned = 0; - int xid; + int xid, long_op; cifs_sb = CIFS_SB(mapping->host->i_sb); @@ -1528,12 +1544,15 @@ retry: cERROR(1, ("No writable handles for inode")); rc = -EBADF; } 
else { + long_op = cifs_write_timeout(cifsi, offset); rc = CIFSSMBWrite2(xid, cifs_sb->tcon, open_file->netfid, bytes_to_write, offset, &bytes_written, iov, n_iov, - CIFS_LONG_OP); + long_op); atomic_dec(&open_file->wrtPending); + cifs_update_eof(cifsi, offset, bytes_written); + if (rc || bytes_written < bytes_to_write) { cERROR(1, ("Write2 ret %d, wrote %d", rc, bytes_written)); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f121a80fdd6f..9c869a6dcba1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -143,6 +143,7 @@ static void cifs_unix_info_to_inode(struct inode *inode, inode->i_nlink = le64_to_cpu(info->Nlinks); + cifsInfo->server_eof = end_of_file; spin_lock(&inode->i_lock); if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* @@ -276,7 +277,8 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = cifs_new_inode(sb, &find_data.UniqueId); + __u64 unique_id = le64_to_cpu(find_data.UniqueId); + *pinode = cifs_new_inode(sb, &unique_id); if (*pinode == NULL) { rc = -ENOMEM; goto cgiiu_exit; @@ -605,12 +607,12 @@ int cifs_get_inode_info(struct inode **pinode, inode->i_mode |= S_IFREG; } + cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile); spin_lock(&inode->i_lock); - if (is_size_safe_to_change(cifsInfo, - le64_to_cpu(pfindData->EndOfFile))) { + if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) { /* can not safely shrink the file size here if the client is writing to it due to potential races */ - i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); + i_size_write(inode, cifsInfo->server_eof); /* 512 bytes (2**9) is the fake blocksize that must be used for this calculation */ @@ -960,13 +962,21 @@ undo_setattr: goto out_close; } + +/* + * If dentry->d_inode is null (usually meaning the cached dentry + * is a negative dentry) then we would attempt a standard SMB delete, but + * if that fails we can not attempt the fall back mechanisms on EACESS + * but will return the EACESS to the caller. Note that the VFS does not call + * unlink on negative dentries currently. 
+ */ int cifs_unlink(struct inode *dir, struct dentry *dentry) { int rc = 0; int xid; char *full_path = NULL; struct inode *inode = dentry->d_inode; - struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifsInodeInfo *cifs_inode; struct super_block *sb = dir->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsTconInfo *tcon = cifs_sb->tcon; @@ -1010,7 +1020,7 @@ psx_del_no_retry: rc = cifs_rename_pending_delete(full_path, dentry, xid); if (rc == 0) drop_nlink(inode); - } else if (rc == -EACCES && dosattr == 0) { + } else if ((rc == -EACCES) && (dosattr == 0) && inode) { attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (attrs == NULL) { rc = -ENOMEM; @@ -1018,7 +1028,8 @@ psx_del_no_retry: } /* try to reset dos attributes */ - origattr = cifsInode->cifsAttrs; + cifs_inode = CIFS_I(inode); + origattr = cifs_inode->cifsAttrs; if (origattr == 0) origattr |= ATTR_NORMAL; dosattr = origattr & ~ATTR_READONLY; @@ -1039,13 +1050,13 @@ psx_del_no_retry: out_reval: if (inode) { - cifsInode = CIFS_I(inode); - cifsInode->time = 0; /* will force revalidate to get info + cifs_inode = CIFS_I(inode); + cifs_inode->time = 0; /* will force revalidate to get info when needed */ inode->i_ctime = current_fs_time(sb); } dir->i_ctime = dir->i_mtime = current_fs_time(sb); - cifsInode = CIFS_I(dir); + cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ kfree(full_path); @@ -1138,6 +1149,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) cFYI(1, ("posix mkdir returned 0x%x", rc)); d_drop(direntry); } else { + __u64 unique_id; if (pInfo->Type == cpu_to_le32(-1)) { /* no return info, go query for it */ kfree(pInfo); @@ -1151,8 +1163,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) else direntry->d_op = &cifs_dentry_ops; - newinode = cifs_new_inode(inode->i_sb, - &pInfo->UniqueId); + unique_id = le64_to_cpu(pInfo->UniqueId); + newinode = cifs_new_inode(inode->i_sb, &unique_id); if (newinode == NULL) { kfree(pInfo); goto mkdir_get_info; @@ -1450,7 +1462,8 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, checking the UniqueId via FILE_INTERNAL_INFO */ unlink_target: - if ((rc == -EACCES) || (rc == -EEXIST)) { + /* Try unlinking the target dentry if it's not negative */ + if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { tmprc = cifs_unlink(target_dir, target_dentry); if (tmprc) goto cifs_rename_exit; @@ -1753,6 +1766,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, } if (rc == 0) { + cifsInode->server_eof = attrs->ia_size; rc = cifs_vmtruncate(inode, attrs->ia_size); cifs_truncate_page(inode->i_mapping, inode->i_size); } @@ -1792,20 +1806,21 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) goto out; } - if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { - /* - Flush data before changing file size or changing the last - write time of the file on the server. If the - flush returns error, store it to report later and continue. - BB: This should be smarter. Why bother flushing pages that - will be truncated anyway? Also, should we error out here if - the flush returns error? - */ - rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) { - cifsInode->write_behind_rc = rc; - rc = 0; - } + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. 
Here, we take + * the safe way out and just do the flush on all setattr requests. If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + if (rc != 0) { + cifsInode->write_behind_rc = rc; + rc = 0; } if (attrs->ia_valid & ATTR_SIZE) { @@ -1903,20 +1918,21 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) return -ENOMEM; } - if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { - /* - Flush data before changing file size or changing the last - write time of the file on the server. If the - flush returns error, store it to report later and continue. - BB: This should be smarter. Why bother flushing pages that - will be truncated anyway? Also, should we error out here if - the flush returns error? - */ - rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) { - cifsInode->write_behind_rc = rc; - rc = 0; - } + /* + * Attempt to flush data before changing attributes. We need to do + * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the + * ownership or mode then we may also need to do this. Here, we take + * the safe way out and just do the flush on all setattr requests. If + * the flush returns error, store it to report later and continue. + * + * BB: This should be smarter. Why bother flushing pages that + * will be truncated anyway? Also, should we error out here if + * the flush returns error? + */ + rc = filemap_write_and_wait(inode->i_mapping); + if (rc != 0) { + cifsInode->write_behind_rc = rc; + rc = 0; } if (attrs->ia_valid & ATTR_SIZE) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 63f644000ce5..ea9d11e3dcbb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -119,16 +119,11 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) full_path = build_path_from_dentry(direntry); if (!full_path) - goto out_no_free; + goto out; cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - target_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (!target_path) { - target_path = ERR_PTR(-ENOMEM); - goto out; - } /* We could change this to: if (pTcon->unix_ext) @@ -138,8 +133,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) if (pTcon->ses->capabilities & CAP_UNIX) rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path, - target_path, - PATH_MAX-1, + &target_path, cifs_sb->local_nls); else { /* BB add read reparse point symlink code here */ @@ -148,22 +142,16 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) /* BB Add MAC style xsymlink check here if enabled */ } - if (rc == 0) { - -/* BB Add special case check for Samba DFS symlinks */ - - target_path[PATH_MAX-1] = 0; - } else { + if (rc != 0) { kfree(target_path); target_path = ERR_PTR(rc); } -out: kfree(full_path); -out_no_free: +out: FreeXid(xid); nd_set_link(nd, target_path); - return NULL; /* No cookie */ + return NULL; } int @@ -224,98 +212,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) return rc; } -int -cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen) -{ - struct inode *inode = direntry->d_inode; - int rc = -EACCES; - int xid; - int oplock = 0; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; - char *full_path = NULL; - char *tmpbuffer; - int len; - __u16 fid; - - xid = GetXid(); - 
cifs_sb = CIFS_SB(inode->i_sb); - pTcon = cifs_sb->tcon; - -/* BB would it be safe against deadlock to grab this sem - even though rename itself grabs the sem and calls lookup? */ -/* mutex_lock(&inode->i_sb->s_vfs_rename_mutex);*/ - full_path = build_path_from_dentry(direntry); -/* mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);*/ - - if (full_path == NULL) { - FreeXid(xid); - return -ENOMEM; - } - - cFYI(1, - ("Full path: %s inode = 0x%p pBuffer = 0x%p buflen = %d", - full_path, inode, pBuffer, buflen)); - if (buflen > PATH_MAX) - len = PATH_MAX; - else - len = buflen; - tmpbuffer = kmalloc(len, GFP_KERNEL); - if (tmpbuffer == NULL) { - kfree(full_path); - FreeXid(xid); - return -ENOMEM; - } - -/* BB add read reparse point symlink code and - Unix extensions symlink code here BB */ -/* We could disable this based on pTcon->unix_ext flag instead ... but why? */ - if (cifs_sb->tcon->ses->capabilities & CAP_UNIX) - rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path, - tmpbuffer, - len - 1, - cifs_sb->local_nls); - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - cERROR(1, ("SFU style symlinks not implemented yet")); - /* add open and read as in fs/cifs/inode.c */ - } else { - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ, - OPEN_REPARSE_POINT, &fid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (!rc) { - rc = CIFSSMBQueryReparseLinkInfo(xid, pTcon, full_path, - tmpbuffer, - len - 1, - fid, - cifs_sb->local_nls); - if (CIFSSMBClose(xid, pTcon, fid)) { - cFYI(1, ("Error closing junction point " - "(open for ioctl)")); - } - /* If it is a DFS junction earlier we would have gotten - PATH_NOT_COVERED returned from server so we do - not need to request the DFS info here */ - } - } - /* BB Anything else to do to handle recursive links? */ - /* BB Should we be using page ops here? */ - - /* BB null terminate returned string in pBuffer? BB */ - if (rc == 0) { - rc = vfs_readlink(direntry, pBuffer, len, tmpbuffer); - cFYI(1, - ("vfs_readlink called from cifs_readlink returned %d", - rc)); - } - - kfree(tmpbuffer); - kfree(full_path); - FreeXid(xid); - return rc; -} - void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) { char *p = nd_get_link(nd); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 4c89c572891a..e079a9190ec4 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -635,77 +635,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length) return; } -/* Windows maps these to the user defined 16 bit Unicode range since they are - reserved symbols (along with \ and /), otherwise illegal to store - in filenames in NTFS */ -#define UNI_ASTERIK (__u16) ('*' + 0xF000) -#define UNI_QUESTION (__u16) ('?' + 0xF000) -#define UNI_COLON (__u16) (':' + 0xF000) -#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) -#define UNI_LESSTHAN (__u16) ('<' + 0xF000) -#define UNI_PIPE (__u16) ('|' + 0xF000) -#define UNI_SLASH (__u16) ('\\' + 0xF000) - -/* Convert 16 bit Unicode pathname from wire format to string in current code - page. Conversion may involve remapping up the seven characters that are - only legal in POSIX-like OS (if they are present in the string). 
Path - names are little endian 16 bit Unicode on the wire */ -int -cifs_convertUCSpath(char *target, const __le16 *source, int maxlen, - const struct nls_table *cp) -{ - int i, j, len; - __u16 src_char; - - for (i = 0, j = 0; i < maxlen; i++) { - src_char = le16_to_cpu(source[i]); - switch (src_char) { - case 0: - goto cUCS_out; /* BB check this BB */ - case UNI_COLON: - target[j] = ':'; - break; - case UNI_ASTERIK: - target[j] = '*'; - break; - case UNI_QUESTION: - target[j] = '?'; - break; - /* BB We can not handle remapping slash until - all the calls to build_path_from_dentry - are modified, as they use slash as separator BB */ - /* case UNI_SLASH: - target[j] = '\\'; - break;*/ - case UNI_PIPE: - target[j] = '|'; - break; - case UNI_GRTRTHAN: - target[j] = '>'; - break; - case UNI_LESSTHAN: - target[j] = '<'; - break; - default: - len = cp->uni2char(src_char, &target[j], - NLS_MAX_CHARSET_SIZE); - if (len > 0) { - j += len; - continue; - } else { - target[j] = '?'; - } - } - j++; - /* make sure we do not overrun callers allocated temp buffer */ - if (j >= (2 * NAME_MAX)) - break; - } -cUCS_out: - target[j] = 0; - return j; -} - /* Convert 16 bit Unicode pathname to wire format from string in current code page. Conversion may involve remapping up the seven characters that are only legal in POSIX-like OS (if they are present in the string). Path diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 8703d68f5b20..e2fe998989a3 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -79,6 +79,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ErrQuota, -EDQUOT}, {ErrNotALink, -ENOLINK}, {ERRnetlogonNotStarted, -ENOPROTOOPT}, + {ERRsymlink, -EOPNOTSUPP}, {ErrTooManyLinks, -EMLINK}, {0, 0} }; @@ -714,6 +715,7 @@ static const struct { ERRDOS, ERRnoaccess, 0xc000028f}, { ERRDOS, ERRnoaccess, 0xc0000290}, { ERRDOS, ERRbadfunc, 0xc000029c}, { + ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { ERRDOS, ERRinvlevel, 0x007c0001}, }; /***************************************************************************** diff --git a/fs/cifs/nterr.h b/fs/cifs/nterr.h index 588abbb9d08c..257267367d41 100644 --- a/fs/cifs/nterr.h +++ b/fs/cifs/nterr.h @@ -35,8 +35,6 @@ struct nt_err_code_struct { extern const struct nt_err_code_struct nt_errs[]; /* Win32 Status codes. 
*/ - -#define STATUS_BUFFER_OVERFLOW 0x80000005 #define STATUS_MORE_ENTRIES 0x0105 #define ERROR_INVALID_PARAMETER 0x0057 #define ERROR_INSUFFICIENT_BUFFER 0x007a @@ -50,6 +48,13 @@ extern const struct nt_err_code_struct nt_errs[]; #define STATUS_SOME_UNMAPPED 0x0107 #define STATUS_BUFFER_OVERFLOW 0x80000005 #define NT_STATUS_NO_MORE_ENTRIES 0x8000001a +#define NT_STATUS_MEDIA_CHANGED 0x8000001c +#define NT_STATUS_END_OF_MEDIA 0x8000001e +#define NT_STATUS_MEDIA_CHECK 0x80000020 +#define NT_STATUS_NO_DATA_DETECTED 0x8000001c +#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d +#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288 +#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000288 #define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001 #define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002 #define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003 diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index c377d8065d99..49c9a4e75319 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -27,29 +27,39 @@ #define UnknownMessage cpu_to_le32(8) /* Negotiate Flags */ -#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are in unicode */ -#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */ -#define NTLMSSP_REQUEST_TARGET 0x04 /* Server return its auth realm */ -#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signature capability */ -#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */ -#define NTLMSSP_NEGOTIATE_DGRAM 0x0040 -#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Sign/seal use LM session key */ -#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */ -#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 +#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */ +#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */ +#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */ +/* define reserved9 0x08 */ +#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */ +#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */ +#define NTLMSSP_NEGOTIATE_DGRAM 0x0040 +#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */ +/* defined reserved 8 0x0100 */ +#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */ +#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */ +#define NTLMSSP_ANONYMOUS 0x0800 +#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */ #define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000 -#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server on same machine */ -#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign for all security levels */ -#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000 -#define NTLMSSP_TARGET_TYPE_SERVER 0x20000 -#define NTLMSSP_TARGET_TYPE_SHARE 0x40000 -#define NTLMSSP_NEGOTIATE_NTLMV2 0x80000 -#define NTLMSSP_REQUEST_INIT_RESP 0x100000 -#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 -#define NTLMSSP_REQUEST_NOT_NT_KEY 0x400000 +#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */ +#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. 
All security levels */ +#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000 +#define NTLMSSP_TARGET_TYPE_SERVER 0x20000 +#define NTLMSSP_TARGET_TYPE_SHARE 0x40000 +#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/ +/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */ +#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000 +#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */ +#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000 #define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 -#define NTLMSSP_NEGOTIATE_128 0x20000000 -#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 -#define NTLMSSP_NEGOTIATE_56 0x80000000 +/* #define reserved4 0x1000000 */ +#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */ +/* #define reserved3 0x4000000 */ +/* #define reserved2 0x8000000 */ +/* #define reserved1 0x10000000 */ +#define NTLMSSP_NEGOTIATE_128 0x20000000 +#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 +#define NTLMSSP_NEGOTIATE_56 0x80000000 /* Although typedefs are not commonly used for structure definitions */ /* in the Linux kernel, in this particular case they are useful */ @@ -60,32 +70,36 @@ typedef struct _SECURITY_BUFFER { __le16 Length; __le16 MaximumLength; - __le32 Buffer; /* offset to buffer */ + __le32 BufferOffset; /* offset to buffer */ } __attribute__((packed)) SECURITY_BUFFER; typedef struct _NEGOTIATE_MESSAGE { __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; - __le32 MessageType; /* 1 */ + __le32 MessageType; /* NtLmNegotiate = 1 */ __le32 NegotiateFlags; SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ char DomainString[0]; /* followed by WorkstationString */ } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; typedef struct _CHALLENGE_MESSAGE { __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; - __le32 MessageType; /* 2 */ + __le32 MessageType; /* NtLmChallenge = 2 */ SECURITY_BUFFER TargetName; __le32 NegotiateFlags; __u8 Challenge[CIFS_CRYPTO_KEY_SIZE]; __u8 Reserved[8]; SECURITY_BUFFER TargetInfoArray; + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ } __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE; typedef struct _AUTHENTICATE_MESSAGE { - __u8 Signature[sizeof (NTLMSSP_SIGNATURE)]; - __le32 MessageType; /* 3 */ + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmsAuthenticate = 3 */ SECURITY_BUFFER LmChallengeResponse; SECURITY_BUFFER NtChallengeResponse; SECURITY_BUFFER DomainName; @@ -93,5 +107,7 @@ typedef struct _AUTHENTICATE_MESSAGE { SECURITY_BUFFER WorkstationName; SECURITY_BUFFER SessionKey; __le32 NegotiateFlags; + /* SECURITY_BUFFER for version info not present since we + do not set the version is present flag */ char UserString[0]; } __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index c2c01ff4c32c..964e097c8203 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -31,6 +31,13 @@ #include "cifs_fs_sb.h" #include "cifsfs.h" +/* + * To be safe - for UCS to UTF-8 with strings loaded with the rare long + * characters alloc more to account for such multibyte target UTF-8 + * characters. 
+ */ +#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2) + #ifdef CONFIG_CIFS_DEBUG2 static void dump_cifs_file_struct(struct file *file, char *label) { @@ -239,6 +246,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, if (atomic_read(&cifsInfo->inUse) == 0) atomic_set(&cifsInfo->inUse, 1); + cifsInfo->server_eof = end_of_file; spin_lock(&tmp_inode->i_lock); if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* can not safely change the file size here if the @@ -375,6 +383,7 @@ static void unix_fill_in_inode(struct inode *tmp_inode, tmp_inode->i_gid = le64_to_cpu(pfindData->Gid); tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks); + cifsInfo->server_eof = end_of_file; spin_lock(&tmp_inode->i_lock); if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* can not safely change the file size here if the @@ -436,6 +445,38 @@ static void unix_fill_in_inode(struct inode *tmp_inode, } } +/* BB eventually need to add the following helper function to + resolve NT_STATUS_STOPPED_ON_SYMLINK return code when + we try to do FindFirst on (NTFS) directory symlinks */ +/* +int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, + int xid) +{ + __u16 fid; + int len; + int oplock = 0; + int rc; + struct cifsTconInfo *ptcon = cifs_sb->tcon; + char *tmpbuffer; + + rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, + OPEN_REPARSE_POINT, &fid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (!rc) { + tmpbuffer = kmalloc(maxpath); + rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path, + tmpbuffer, + maxpath -1, + fid, + cifs_sb->local_nls); + if (CIFSSMBClose(xid, ptcon, fid)) { + cFYI(1, ("Error closing temporary reparsepoint open)")); + } + } +} + */ + static int initiate_cifs_search(const int xid, struct file *file) { int rc = 0; @@ -491,7 +532,10 @@ ffirst_retry: CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); if (rc == 0) cifsFile->invalidHandle = false; - if ((rc == -EOPNOTSUPP) && + /* BB add following call to handle readdir on new NTFS symlink errors + else if STATUS_STOPPED_ON_SYMLINK + call get_symlink_reparse_path and retry with new path */ + else if ((rc == -EOPNOTSUPP) && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; goto ffirst_retry; @@ -820,7 +864,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, /* inode num, inode type and filename returned */ static int cifs_get_name_from_search_buf(struct qstr *pqst, char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum) + struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum) { int rc = 0; unsigned int len = 0; @@ -840,7 +884,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, len = strnlen(filename, PATH_MAX); } - *pinum = pFindData->UniqueId; + *pinum = le64_to_cpu(pFindData->UniqueId); } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { FILE_DIRECTORY_INFO *pFindData = (FILE_DIRECTORY_INFO *)current_entry; @@ -856,7 +900,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, (SEARCH_ID_FULL_DIR_INFO *)current_entry; filename = &pFindData->FileName[0]; len = le32_to_cpu(pFindData->FileNameLength); - *pinum = pFindData->UniqueId; + *pinum = le64_to_cpu(pFindData->UniqueId); } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { FILE_BOTH_DIRECTORY_INFO *pFindData = (FILE_BOTH_DIRECTORY_INFO *)current_entry; @@ -879,14 +923,12 @@ static int 
cifs_get_name_from_search_buf(struct qstr *pqst, } if (unicode) { - /* BB fixme - test with long names */ - /* Note converted filename can be longer than in unicode */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) - pqst->len = cifs_convertUCSpath((char *)pqst->name, - (__le16 *)filename, len/2, nlt); - else - pqst->len = cifs_strfromUCS_le((char *)pqst->name, - (__le16 *)filename, len/2, nlt); + pqst->len = cifs_from_ucs2((char *) pqst->name, + (__le16 *) filename, + UNICODE_NAME_MAX, + min(len, max_len), nlt, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else { pqst->name = filename; pqst->len = len; @@ -896,8 +938,8 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, return rc; } -static int cifs_filldir(char *pfindEntry, struct file *file, - filldir_t filldir, void *direntry, char *scratch_buf, int max_len) +static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, + void *direntry, char *scratch_buf, unsigned int max_len) { int rc = 0; struct qstr qstring; @@ -994,7 +1036,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) int num_to_fill = 0; char *tmp_buf = NULL; char *end_of_smb; - int max_len; + unsigned int max_len; xid = GetXid(); @@ -1068,11 +1110,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) cifsFile->srch_inf.ntwrk_buf_start); end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; - /* To be safe - for UCS to UTF-8 with strings loaded - with the rare long characters alloc more to account for - such multibyte target UTF-8 characters. cifs_unicode.c, - which actually does the conversion, has the same limit */ - tmp_buf = kmalloc((2 * NAME_MAX) + 4, GFP_KERNEL); + tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); for (i = 0; (i < num_to_fill) && (rc == 0); i++) { if (current_entry == NULL) { /* evaluate whether this case is an error */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 5c68b4282be9..897a052270f9 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -3,7 +3,7 @@ * * SMB/CIFS session setup handling routines * - * Copyright (c) International Business Machines Corp., 2006, 2007 + * Copyright (c) International Business Machines Corp., 2006, 2009 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -111,7 +111,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses) get_vc_num_exit: write_unlock(&cifs_tcp_ses_lock); - return le16_to_cpu(vcnum); + return cpu_to_le16(vcnum); } static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) @@ -277,85 +277,51 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, *pbcc_area = bcc_ptr; } -static int decode_unicode_ssetup(char **pbcc_area, int bleft, - struct cifsSesInfo *ses, - const struct nls_table *nls_cp) +static void +decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, + const struct nls_table *nls_cp) { - int rc = 0; - int words_left, len; + int len; char *data = *pbcc_area; - - cFYI(1, ("bleft %d", bleft)); - - /* SMB header is unaligned, so cifs servers word align start of - Unicode strings */ - data++; - bleft--; /* Windows servers do not always double null terminate - their final Unicode string - in which case we - now will not attempt to decode the byte of junk - which follows it */ - - words_left = bleft / 2; - - /* save off server operating system */ - len = UniStrnlen((wchar_t *) data, words_left); - -/* We look for obvious messed up bcc or strings in response 
so we do not go off - the end since (at least) WIN2K and Windows XP have a major bug in not null - terminating last Unicode string in response */ - if (len >= words_left) - return rc; + /* + * Windows servers do not always double null terminate their final + * Unicode string. Check to see if there are an uneven number of bytes + * left. If so, then add an extra NULL pad byte to the end of the + * response. + * + * See section 2.7.2 in "Implementing CIFS" for details + */ + if (bleft % 2) { + data[bleft] = 0; + ++bleft; + } kfree(ses->serverOS); - /* UTF-8 string will not grow more than four times as big as UCS-16 */ - ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); - if (ses->serverOS != NULL) - cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); - data += 2 * (len + 1); - words_left -= len + 1; - - /* save off server network operating system */ - len = UniStrnlen((wchar_t *) data, words_left); - - if (len >= words_left) - return rc; + ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, ("serverOS=%s", ses->serverOS)); + len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; + data += len; + bleft -= len; + if (bleft <= 0) + return; kfree(ses->serverNOS); - ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); - if (ses->serverNOS != NULL) { - cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, - nls_cp); - if (strncmp(ses->serverNOS, "NT LAN Manager 4", 16) == 0) { - cFYI(1, ("NT4 server")); - ses->flags |= CIFS_SES_NT4; - } - } - data += 2 * (len + 1); - words_left -= len + 1; - - /* save off server domain */ - len = UniStrnlen((wchar_t *) data, words_left); - - if (len > words_left) - return rc; + ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, ("serverNOS=%s", ses->serverNOS)); + len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; + data += len; + bleft -= len; + if (bleft <= 0) + return; kfree(ses->serverDomain); - ses->serverDomain = kzalloc(2 * (len + 1), GFP_KERNEL); /* BB FIXME wrong length */ - if (ses->serverDomain != NULL) { - cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len, - nls_cp); - ses->serverDomain[2*len] = 0; - ses->serverDomain[(2*len) + 1] = 0; - } - data += 2 * (len + 1); - words_left -= len + 1; + ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + cFYI(1, ("serverDomain=%s", ses->serverDomain)); - cFYI(1, ("words left: %d", words_left)); - - return rc; + return; } static int decode_ascii_ssetup(char **pbcc_area, int bleft, @@ -412,6 +378,186 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft, return rc; } +static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, + struct cifsSesInfo *ses) +{ + CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; + + if (blob_len < sizeof(CHALLENGE_MESSAGE)) { + cERROR(1, ("challenge blob len %d too small", blob_len)); + return -EINVAL; + } + + if (memcmp(pblob->Signature, "NTLMSSP", 8)) { + cERROR(1, ("blob signature incorrect %s", pblob->Signature)); + return -EINVAL; + } + if (pblob->MessageType != NtLmChallenge) { + cERROR(1, ("Incorrect message type %d", pblob->MessageType)); + return -EINVAL; + } + + memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); + /* BB we could decode pblob->NegotiateFlags; some may be useful */ + /* In particular we can examine sign flags */ + /* BB spec says that if AvId field of MsvAvTimestamp is populated then + we must set the MIC field of the AUTHENTICATE_MESSAGE */ + + return 0; +} + +#ifdef 
CONFIG_CIFS_EXPERIMENTAL +/* BB Move to ntlmssp.c eventually */ + +/* We do not malloc the blob, it is passed in pbuffer, because + it is fixed size, and small, making this approach cleaner */ +static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, + struct cifsSesInfo *ses) +{ + NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; + __u32 flags; + + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmNegotiate; + + /* BB is NTLMV2 session security format easier to use here? */ + flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; + if (ses->server->secMode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (ses->server->secMode & SECMODE_SIGN_REQUIRED) + flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; + + sec_blob->NegotiateFlags |= cpu_to_le32(flags); + + sec_blob->WorkstationName.BufferOffset = 0; + sec_blob->WorkstationName.Length = 0; + sec_blob->WorkstationName.MaximumLength = 0; + + /* Domain name is sent on the Challenge not Negotiate NTLMSSP request */ + sec_blob->DomainName.BufferOffset = 0; + sec_blob->DomainName.Length = 0; + sec_blob->DomainName.MaximumLength = 0; +} + +/* We do not malloc the blob, it is passed in pbuffer, because its + maximum possible size is fixed and small, making this approach cleaner. + This function returns the length of the data in the blob */ +static int build_ntlmssp_auth_blob(unsigned char *pbuffer, + struct cifsSesInfo *ses, + const struct nls_table *nls_cp, int first) +{ + AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; + __u32 flags; + unsigned char *tmp; + char ntlm_session_key[CIFS_SESS_KEY_SIZE]; + + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmAuthenticate; + + flags = NTLMSSP_NEGOTIATE_56 | + NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; + if (ses->server->secMode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + flags |= NTLMSSP_NEGOTIATE_SIGN; + if (ses->server->secMode & SECMODE_SIGN_REQUIRED) + flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; + + tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); + sec_blob->NegotiateFlags |= cpu_to_le32(flags); + + sec_blob->LmChallengeResponse.BufferOffset = + cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); + sec_blob->LmChallengeResponse.Length = 0; + sec_blob->LmChallengeResponse.MaximumLength = 0; + + /* calculate session key, BB what about adding similar ntlmv2 path? 
*/ + SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key); + if (first) + cifs_calculate_mac_key(&ses->server->mac_signing_key, + ntlm_session_key, ses->password); + + memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE); + sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); + sec_blob->NtChallengeResponse.MaximumLength = + cpu_to_le16(CIFS_SESS_KEY_SIZE); + + tmp += CIFS_SESS_KEY_SIZE; + + if (ses->domainName == NULL) { + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.Length = 0; + sec_blob->DomainName.MaximumLength = 0; + tmp += 2; + } else { + int len; + len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, + MAX_USERNAME_SIZE, nls_cp); + len *= 2; /* unicode is 2 bytes each */ + len += 2; /* trailing null */ + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.Length = cpu_to_le16(len); + sec_blob->DomainName.MaximumLength = cpu_to_le16(len); + tmp += len; + } + + if (ses->userName == NULL) { + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.Length = 0; + sec_blob->UserName.MaximumLength = 0; + tmp += 2; + } else { + int len; + len = cifs_strtoUCS((__le16 *)tmp, ses->userName, + MAX_USERNAME_SIZE, nls_cp); + len *= 2; /* unicode is 2 bytes each */ + len += 2; /* trailing null */ + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.Length = cpu_to_le16(len); + sec_blob->UserName.MaximumLength = cpu_to_le16(len); + tmp += len; + } + + sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->WorkstationName.Length = 0; + sec_blob->WorkstationName.MaximumLength = 0; + tmp += 2; + + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = 0; + sec_blob->SessionKey.MaximumLength = 0; + return tmp - pbuffer; +} + + +static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB, + struct cifsSesInfo *ses) +{ + build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses); + pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + + return; +} + +static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB, + struct cifsSesInfo *ses, + const struct nls_table *nls, int first_time) +{ + int bloblen; + + bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls, + first_time); + pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen); + + return bloblen; +} +#endif + int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, const struct nls_table *nls_cp) @@ -430,6 +576,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, __u16 action; int bytes_remaining; struct key *spnego_key = NULL; + __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ if (ses == NULL) return -EINVAL; @@ -437,6 +584,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, type = ses->server->secType; cFYI(1, ("sess setup type %d", type)); +ssetup_ntlmssp_authenticate: + if (phase == NtLmChallenge) + phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + if (type == LANMAN) { #ifndef CONFIG_CIFS_WEAK_PW_HASH /* LANMAN and plaintext are less secure and off by default. 
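
The remaining sess.c hunks below turn session setup into a small state machine for RawNTLMSSP: a type 1 Negotiate blob goes out first, the server's NT_STATUS_MORE_PROCESSING_REQUIRED reply carries the type 2 Challenge blob, and the code then loops back (the ssetup_ntlmssp_authenticate goto) to send the type 3 Authenticate blob under the server-assigned Uid. A compilable sketch of that control flow; struct session and the send_negotiate()/decode_challenge()/send_authenticate() helpers are hypothetical stand-ins, with -EAGAIN standing in for the MORE_PROCESSING_REQUIRED status:

#include <errno.h>
#include <stdio.h>

struct session { int dummy; };	/* hypothetical stand-in for cifsSesInfo */
enum ntlmssp_phase { NtLmNegotiate = 1, NtLmChallenge = 2, NtLmAuthenticate = 3 };

/* Stubbed round trips; the real code marshals SMB SESSION_SETUP_ANDX. */
static int send_negotiate(struct session *ses)    { (void)ses; return -EAGAIN; }
static int decode_challenge(struct session *ses)  { (void)ses; return 0; }
static int send_authenticate(struct session *ses) { (void)ses; return 0; }

/* Mirrors the shape of the patched CIFS_SessSetup(): treat the
   "more processing required" status after the Negotiate round as
   expected rather than an error, decode the Challenge, then loop
   back once for the final Authenticate round. */
static int ntlmssp_session_setup(struct session *ses)
{
	enum ntlmssp_phase phase = NtLmNegotiate;
	int rc;

ssetup_again:
	if (phase == NtLmNegotiate) {
		rc = send_negotiate(ses);
		if (rc != -EAGAIN)	/* a real error, or unexpected success */
			return rc;
		rc = decode_challenge(ses);	/* saves the server challenge */
		if (rc)
			return rc;
		phase = NtLmAuthenticate;
		goto ssetup_again;	/* same pattern as the patch's goto */
	}
	return send_authenticate(ses);
}

int main(void)
{
	struct session s = { 0 };
	printf("session setup rc = %d\n", ntlmssp_session_setup(&s));
	return 0;
}
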
@@ -650,9 +801,53 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, goto ssetup_exit; #endif /* CONFIG_CIFS_UPCALL */ } else { +#ifdef CONFIG_CIFS_EXPERIMENTAL + if ((experimEnabled > 1) && (type == RawNTLMSSP)) { + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cERROR(1, ("NTLMSSP requires Unicode support")); + rc = -ENOSYS; + goto ssetup_exit; + } + + cFYI(1, ("ntlmssp session setup phase %d", phase)); + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + if (phase == NtLmNegotiate) { + setup_ntlmssp_neg_req(pSMB, ses); + iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + } else if (phase == NtLmAuthenticate) { + int blob_len; + blob_len = setup_ntlmssp_auth_req(pSMB, ses, + nls_cp, + first_time); + iov[1].iov_len = blob_len; + /* Make sure that we tell the server that we + are using the uid that it just gave us back + on the response (challenge) */ + smb_buf->Uid = ses->Suid; + } else { + cERROR(1, ("invalid phase %d", phase)); + rc = -ENOSYS; + goto ssetup_exit; + } + iov[1].iov_base = &pSMB->req.SecurityBlob[0]; + /* unicode strings must be word aligned */ + if ((iov[0].iov_len + iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, nls_cp); + } else { + cERROR(1, ("secType %d not supported!", type)); + rc = -ENOSYS; + goto ssetup_exit; + } +#else cERROR(1, ("secType %d not supported!", type)); rc = -ENOSYS; goto ssetup_exit; +#endif } iov[2].iov_base = str_area; @@ -668,12 +863,23 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, /* SMB request buf freed in SendReceive2 */ cFYI(1, ("ssetup rc from sendrecv2 is %d", rc)); - if (rc) - goto ssetup_exit; pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; smb_buf = (struct smb_hdr *)iov[0].iov_base; + if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { + if (phase != NtLmNegotiate) { + cERROR(1, ("Unexpected more processing error")); + goto ssetup_exit; + } + /* NTLMSSP Negotiate sent now processing challenge (response) */ + phase = NtLmChallenge; /* process ntlmssp challenge */ + rc = 0; /* MORE_PROC rc is not an error here, but expected */ + } + if (rc) + goto ssetup_exit; + if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { rc = -EIO; cERROR(1, ("bad word count %d", smb_buf->WordCount)); @@ -692,22 +898,33 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, if (smb_buf->WordCount == 4) { __u16 blob_len; blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); - bcc_ptr += blob_len; if (blob_len > bytes_remaining) { cERROR(1, ("bad security blob length %d", blob_len)); rc = -EINVAL; goto ssetup_exit; } + if (phase == NtLmChallenge) { + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); + /* now goto beginning for ntlmssp authenticate phase */ + if (rc) + goto ssetup_exit; + } + bcc_ptr += blob_len; bytes_remaining -= blob_len; } /* BB check if Unicode and decode strings */ - if (smb_buf->Flags2 & SMBFLG2_UNICODE) - rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining, - ses, nls_cp); - else + if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + } else { rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + } ssetup_exit: if (spnego_key) { @@ -721,5 
+938,9 @@ ssetup_exit: } else if (resp_buf_type == CIFS_LARGE_BUFFER) cifs_buf_release(iov[0].iov_base); + /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ + if ((phase == NtLmChallenge) && (rc == 0)) + goto ssetup_ntlmssp_authenticate; + return rc; } diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h index 7f50e8577c1c..c5084d27db7c 100644 --- a/fs/cifs/smberr.h +++ b/fs/cifs/smberr.h @@ -110,6 +110,7 @@ /* Below errors are used internally (do not come over the wire) for passthrough from STATUS codes to POSIX only */ +#define ERRsymlink 0xFFFD #define ErrTooManyLinks 0xFFFE /* Following error codes may be generated with the ERRSRV error class.*/ diff --git a/fs/compat.c b/fs/compat.c index 1c859dae758f..681ed81e6be0 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -181,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename, struct compat_stat __user *statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } asmlinkage long compat_sys_newlstat(char __user * filename, struct compat_stat __user *statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } #ifndef __ARCH_WANT_STAT64 @@ -204,21 +206,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, struct compat_stat __user *statbuf, int flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_compat_stat(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } #endif @@ -1236,7 +1229,7 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, asmlinkage ssize_t compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low) + unsigned long vlen, u32 pos_low, u32 pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; @@ -1293,7 +1286,7 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_high, u32 pos_low) + unsigned long vlen, u32 pos_low, u32 pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; @@ -1483,6 +1476,7 @@ int compat_do_execve(char * filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; + bool clear_in_exec; int retval; retval = unshare_files(&displaced); @@ -1505,8 +1499,9 @@ int compat_do_execve(char * filename, goto out_unlock; retval = check_unsafe_exec(bprm); - if (retval) + if (retval < 0) goto out_unlock; + clear_in_exec = retval; file = open_exec(filename); retval = PTR_ERR(file); @@ -1553,9 +1548,7 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ - write_lock(&current->fs->lock); current->fs->in_exec = 0; - write_unlock(&current->fs->lock); current->in_execve = 0;
mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1575,9 +1568,8 @@ out_file: } out_unmark: - write_lock(¤t->fs->lock); - current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); + if (clear_in_exec) + current->fs->in_exec = 0; out_unlock: current->in_execve = 0; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 3e87ce443ea2..b83f6bcfa51a 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -58,7 +58,6 @@ #include <linux/i2c.h> #include <linux/i2c-dev.h> #include <linux/atalk.h> -#include <linux/loop.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci.h> @@ -68,6 +67,7 @@ #include <linux/gigaset_dev.h> #ifdef CONFIG_BLOCK +#include <linux/loop.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/sg.h> @@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl) HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl) /* block stuff */ #ifdef CONFIG_BLOCK +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) /* Raw devices */ HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) @@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) -/* loop */ -IGNORE_IOCTL(LOOP_CLR_FD) - #ifdef CONFIG_SPARC /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ IGNORE_IOCTL(FBIOGTYPE) diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 932a92b31483..c8afa6b1d91d 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -135,7 +135,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna struct path path; struct configfs_dirent *sd; struct config_item *parent_item; - struct config_item *target_item; + struct config_item *target_item = NULL; struct config_item_type *type; ret = -EPERM; /* What lack-of-symlink returns */ diff --git a/fs/dcache.c b/fs/dcache.c index 761d30be2683..75659a6fd1f8 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -481,7 +481,7 @@ restart: if ((flags & DCACHE_REFERENCED) && (dentry->d_flags & DCACHE_REFERENCED)) { dentry->d_flags &= ~DCACHE_REFERENCED; - list_move_tail(&dentry->d_lru, &referenced); + list_move(&dentry->d_lru, &referenced); spin_unlock(&dentry->d_lock); } else { list_move_tail(&dentry->d_lru, &tmp); @@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) int result; unsigned long seq; - /* FIXME: This is old behavior, needed? Please check callers. 
*/ if (new_dentry == old_dentry) return 1; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 81ae9ea3c6e1..0662ba6de85a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -30,6 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; +static bool debugfs_registered; static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) { @@ -496,6 +497,16 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + static struct kobject *debug_kobj; static int __init debugfs_init(void) @@ -509,11 +520,16 @@ static int __init debugfs_init(void) retval = register_filesystem(&debug_fs_type); if (retval) kobject_put(debug_kobj); + else + debugfs_registered = true; + return retval; } static void __exit debugfs_exit(void) { + debugfs_registered = false; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); unregister_filesystem(&debug_fs_type); kobject_put(debug_kobj); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 63a4a59e4148..c68edb969441 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -90,6 +90,15 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) #define PARSE_MOUNT 0 #define PARSE_REMOUNT 1 +/* + * parse_mount_options(): + * Set @opts to mount options specified in @data. If an option is not + * specified in @data, set it to its default value. The exception is + * 'newinstance' option which can only be set/cleared on a mount (i.e. + * cannot be changed during remount). + * + * Note: @data may be NULL (in which case all options are set to default). + */ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) { char *p; @@ -355,12 +364,9 @@ static int devpts_get_sb(struct file_system_type *fs_type, struct pts_mount_opts opts; struct super_block *s; - memset(&opts, 0, sizeof(opts)); - if (data) { - error = parse_mount_options(data, PARSE_MOUNT, &opts); - if (error) - return error; - } + error = parse_mount_options(data, PARSE_MOUNT, &opts); + if (error) + return error; if (opts.newinstance) s = sget(fs_type, NULL, set_anon_super, NULL); @@ -389,11 +395,10 @@ static int devpts_get_sb(struct file_system_type *fs_type, return 0; out_dput: - dput(s->s_root); + dput(s->s_root); /* undo dget() in simple_set_mnt() */ out_undo_sget: - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); return error; } diff --git a/fs/direct-io.c b/fs/direct-io.c index b6d43908ff7a..05763bbc2050 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -307,8 +307,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, struct bio *bio; bio = bio_alloc(GFP_KERNEL, nr_vecs); - if (bio == NULL) - return -ENOMEM; bio->bi_bdev = bdev; bio->bi_sector = first_sector; @@ -1126,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC; + rw = WRITE_ODIRECT; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 8b65f289ee00..b91851f1cda3 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -483,15 +483,7 @@ int ecryptfs_encrypt_page(struct page *page) ecryptfs_inode = page->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = 
ecryptfs_write_lower_page_segment(ecryptfs_inode, page, - 0, PAGE_CACHE_SIZE); - if (rc) - printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __func__, - page->index); - goto out; - } + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); enc_extent_page = alloc_page(GFP_USER); if (!enc_extent_page) { rc = -ENOMEM; @@ -620,16 +612,7 @@ int ecryptfs_decrypt_page(struct page *page) ecryptfs_inode = page->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_read_lower_page_segment(page, page->index, 0, - PAGE_CACHE_SIZE, - ecryptfs_inode); - if (rc) - printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __func__, - page->index); - goto out; - } + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); enc_extent_page = alloc_page(GFP_USER); if (!enc_extent_page) { rc = -ENOMEM; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 064c5820e4e5..00b30a2d5466 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat { #define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 #define ECRYPTFS_ENCFN_USE_FEK 0x00002000 +#define ECRYPTFS_UNLINK_SIGS 0x00004000 u32 flags; unsigned int file_version; size_t iv_bytes; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 55b3145b8072..2f0945d63297 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -379,9 +379,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, goto out_d_drop; } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " @@ -406,9 +408,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out_d_drop; } + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size - 1); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " @@ -636,8 +640,9 @@ static int ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) { char *lower_buf; + size_t lower_bufsiz; struct dentry *lower_dentry; - struct ecryptfs_crypt_stat *crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; char *plaintext_name; size_t plaintext_name_size; mm_segment_t old_fs; @@ -648,12 +653,21 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) rc = -EINVAL; goto out; } - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + /* + * If the lower filename is encrypted, it will result in a significantly + * longer name. If needed, truncate the name after decode and decrypt. 
+ */ + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + lower_bufsiz = PATH_MAX; + else + lower_bufsiz = bufsiz; /* Released in this function */ - lower_buf = kmalloc(bufsiz, GFP_KERNEL); + lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); if (lower_buf == NULL) { printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%d] bytes\n", __func__, bufsiz); + "kmalloc [%zd] bytes\n", __func__, lower_bufsiz); rc = -ENOMEM; goto out; } @@ -661,7 +675,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) set_fs(get_ds()); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, (char __user *)lower_buf, - bufsiz); + lower_bufsiz); set_fs(old_fs); if (rc >= 0) { rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, @@ -674,7 +688,9 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) rc); goto out_free_lower_buf; } - rc = copy_to_user(buf, plaintext_name, plaintext_name_size); + /* Check for bufsiz <= 0 done in sys_readlinkat() */ + rc = copy_to_user(buf, plaintext_name, + min((size_t) bufsiz, plaintext_name_size)); if (rc) rc = -EFAULT; else @@ -814,6 +830,13 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) size_t num_zeros = (PAGE_CACHE_SIZE - (new_length & ~PAGE_CACHE_MASK)); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = vmtruncate(inode, new_length); + if (rc) + goto out_free; + rc = vmtruncate(lower_dentry->d_inode, new_length); + goto out_free; + } if (num_zeros) { char *zeros_virt; @@ -915,8 +938,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } rc = 0; crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - goto out; } } mutex_unlock(&crypt_stat->cs_mutex); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index aed56c25539b..9f0aa9883c28 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -190,14 +190,14 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) - d_add(dentry, inode); - else - d_instantiate(dentry, inode); fsstack_copy_attr_all(inode, lower_inode, NULL); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); + if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); out: return rc; } @@ -208,7 +208,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, - ecryptfs_opt_err }; + ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -222,6 +222,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, + {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_err, NULL} }; @@ -402,6 +403,9 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) fn_cipher_key_bytes; fn_cipher_key_bytes_set = 1; break; + case ecryptfs_opt_unlink_sigs: + mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING @@ -610,9 +614,8 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int 
flags, } goto out; out_abort: - dput(sb->s_root); - up_write(&sb->s_umount); - deactivate_super(sb); + dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ + deactivate_locked_super(sb); out: return rc; } diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 295e7fa56755..f1c17e87c5fb 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -133,45 +133,6 @@ out: return rc; } -static int -ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, - struct ecryptfs_msg_ctx **msg_ctx); - -/** - * ecryptfs_send_raw_message - * @msg_type: Message type - * @daemon: Daemon struct for recipient of message - * - * A raw message is one that does not include an ecryptfs_message - * struct. It simply has a type. - * - * Must be called with ecryptfs_daemon_hash_mux held. - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_send_raw_message(u8 msg_type, - struct ecryptfs_daemon *daemon) -{ - struct ecryptfs_msg_ctx *msg_ctx; - int rc; - - rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx); - if (rc) { - printk(KERN_ERR "%s: Error whilst attempting to send " - "message to ecryptfsd; rc = [%d]\n", __func__, rc); - goto out; - } - /* Raw messages are logically context-free (e.g., no - * reply is expected), so we set the state of the - * ecryptfs_msg_ctx object to indicate that it should - * be freed as soon as the message is sent. */ - mutex_lock(&msg_ctx->mux); - msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; - mutex_unlock(&msg_ctx->mux); -out: - return rc; -} - /** * ecryptfs_spawn_daemon - Create and initialize a new daemon struct * @daemon: Pointer to set to newly allocated daemon struct @@ -212,49 +173,6 @@ out: } /** - * ecryptfs_process_helo - * @euid: The user ID owner of the message - * @user_ns: The namespace in which @euid applies - * @pid: The process ID for the userspace program that sent the - * message - * - * Adds the euid and pid values to the daemon euid hash. If an euid - * already has a daemon pid registered, the daemon will be - * unregistered before the new daemon is put into the hash list. - * Returns zero after adding a new daemon to the hash list; - * non-zero otherwise. 
- */ -int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - struct ecryptfs_daemon *new_daemon; - struct ecryptfs_daemon *old_daemon; - int rc; - - mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns); - if (rc != 0) { - printk(KERN_WARNING "Received request from user [%d] " - "to register daemon [0x%p]; unregistering daemon " - "[0x%p]\n", euid, pid, old_daemon->pid); - rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon); - if (rc) - printk(KERN_WARNING "Failed to send QUIT " - "message to daemon [0x%p]; rc = [%d]\n", - old_daemon->pid, rc); - hlist_del(&old_daemon->euid_chain); - kfree(old_daemon); - } - rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid); - if (rc) - printk(KERN_ERR "%s: The gods are displeased with this attempt " - "to create a new daemon object for euid [%d]; pid " - "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc); - mutex_unlock(&ecryptfs_daemon_hash_mux); - return rc; -} - -/** * ecryptfs_exorcise_daemon - Destroy the daemon struct * * Must be called ceremoniously while in possession of diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index a67fea655f49..4ec8f61ccf5a 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -193,26 +193,20 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, int rc = 0; mutex_lock(&msg_ctx->mux); - if (data) { - msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), - GFP_KERNEL); - if (!msg_ctx->msg) { - rc = -ENOMEM; - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%zd, GFP_KERNEL)\n", __func__, - (sizeof(*msg_ctx->msg) + data_size)); - goto out_unlock; - } - } else - msg_ctx->msg = NULL; + msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), + GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, + (sizeof(*msg_ctx->msg) + data_size)); + goto out_unlock; + } msg_ctx->msg->index = msg_ctx->index; msg_ctx->msg->data_len = data_size; msg_ctx->type = msg_type; - if (data) { - memcpy(msg_ctx->msg->data, data, data_size); - msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); - } else - msg_ctx->msg_size = 0; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); mutex_lock(&daemon->mux); list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); daemon->num_queued_msg_ctx++; @@ -418,18 +412,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, if (count == 0) goto out; - data = kmalloc(count, GFP_KERNEL); - if (!data) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); + + data = memdup_user(buf, count); + if (IS_ERR(data)) { + printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", + __func__, PTR_ERR(data)); goto out; } - rc = copy_from_user(data, buf, count); - if (rc) { - printk(KERN_ERR "%s: copy_from_user returned error [%d]\n", - __func__, rc); - goto out_free; - } sz = count; i = 0; switch (data[i++]) { diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 46cec2b69796..5c6bab9786e3 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -449,6 +449,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); if (crypt_stat->flags & 
ECRYPTFS_METADATA_IN_XATTR) return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); else @@ -490,6 +491,16 @@ static int ecryptfs_write_end(struct file *file, ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16x], to = [%d])\n", index, to); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, + to); + if (!rc) { + rc = copied; + fsstack_copy_inode_size(ecryptfs_inode, + ecryptfs_inode_to_lower(ecryptfs_inode)); + } + goto out; + } /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 75c2ea9fee35..a137c6ea2fee 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -117,13 +117,15 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, size_t size) { struct page *ecryptfs_page; + struct ecryptfs_crypt_stat *crypt_stat; + struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; char *ecryptfs_page_virt; - loff_t ecryptfs_file_size = - i_size_read(ecryptfs_file->f_dentry->d_inode); + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; loff_t pos; int rc = 0; + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; /* * if we are writing beyond current size, then start pos * at the current size - we'll fill in zeros from there. @@ -184,7 +186,13 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); - rc = ecryptfs_encrypt_page(ecryptfs_page); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) + rc = ecryptfs_encrypt_page(ecryptfs_page); + else + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + ecryptfs_page, + start_offset_in_page, + data_offset); page_cache_release(ecryptfs_page); if (rc) { printk(KERN_ERR "%s: Error encrypting " @@ -194,14 +202,16 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, pos += num_bytes; } if ((offset + size) > ecryptfs_file_size) { - i_size_write(ecryptfs_file->f_dentry->d_inode, (offset + size)); - rc = ecryptfs_write_inode_size_to_metadata( - ecryptfs_file->f_dentry->d_inode); - if (rc) { - printk(KERN_ERR "Problem with " - "ecryptfs_write_inode_size_to_metadata; " - "rc = [%d]\n", rc); - goto out; + i_size_write(ecryptfs_inode, (offset + size)); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { + rc = ecryptfs_write_inode_size_to_metadata( + ecryptfs_inode); + if (rc) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc); + goto out; + } } } out: diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index c27ac2b358a1..fa4c7e7d15d9 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -170,7 +170,10 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) list_for_each_entry(walker, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { - seq_printf(m, ",ecryptfs_sig=%s", walker->sig); + if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK) + seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig); + else + seq_printf(m, ",ecryptfs_sig=%s", walker->sig); } mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); @@ -186,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",ecryptfs_xattr_metadata"); if (mount_crypt_stat->flags & 
ECRYPTFS_ENCRYPTED_VIEW_ENABLED) seq_printf(m, ",ecryptfs_encrypted_view"); + if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) + seq_printf(m, ",ecryptfs_unlink_sigs"); return 0; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a89f370fadb5..5458e80fc558 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1212,7 +1212,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) SYSCALL_DEFINE1(epoll_create, int, size) { - if (size < 0) + if (size <= 0) return -EINVAL; return sys_epoll_create1(0); diff --git a/fs/exec.c b/fs/exec.c index 052a961e41aa..895823d0149d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -69,17 +69,18 @@ int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); -int register_binfmt(struct linux_binfmt * fmt) +int __register_binfmt(struct linux_binfmt * fmt, int insert) { if (!fmt) return -EINVAL; write_lock(&binfmt_lock); - list_add(&fmt->lh, &formats); + insert ? list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); return 0; } -EXPORT_SYMBOL(register_binfmt); +EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { @@ -104,40 +105,28 @@ static inline void put_binfmt(struct linux_binfmt * fmt) SYSCALL_DEFINE1(uselib, const char __user *, library) { struct file *file; - struct nameidata nd; char *tmp = getname(library); int error = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { - error = path_lookup_open(AT_FDCWD, tmp, - LOOKUP_FOLLOW, &nd, - FMODE_READ|FMODE_EXEC); - putname(tmp); - } - if (error) + if (IS_ERR(tmp)) + goto out; + + file = do_filp_open(AT_FDCWD, tmp, + O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, + MAY_READ | MAY_EXEC | MAY_OPEN); + putname(tmp); + error = PTR_ERR(file); + if (IS_ERR(file)) goto out; error = -EINVAL; - if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) goto exit; error = -EACCES; - if (nd.path.mnt->mnt_flags & MNT_NOEXEC) + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - error = inode_permission(nd.path.dentry->d_inode, - MAY_READ | MAY_EXEC | MAY_OPEN); - if (error) - goto exit; - error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); - if (error) - goto exit; - - file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - fsnotify_open(file->f_path.dentry); error = -ENOEXEC; @@ -159,13 +148,10 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) } read_unlock(&binfmt_lock); } +exit: fput(file); out: return error; -exit: - release_open_intent(&nd); - path_put(&nd.path); - goto out; } #ifdef CONFIG_MMU @@ -660,47 +646,33 @@ EXPORT_SYMBOL(setup_arg_pages); struct file *open_exec(const char *name) { - struct nameidata nd; struct file *file; int err; - err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, - FMODE_READ|FMODE_EXEC); - if (err) + file = do_filp_open(AT_FDCWD, name, + O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, + MAY_EXEC | MAY_OPEN); + if (IS_ERR(file)) goto out; err = -EACCES; - if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) - goto out_path_put; - - if (nd.path.mnt->mnt_flags & MNT_NOEXEC) - goto out_path_put; - - err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); - if (err) - goto out_path_put; - err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); - if (err) - goto out_path_put; + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) + goto exit; - file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); - if (IS_ERR(file)) - return file; + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; 
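Note on the fs/eventpoll.c hunk above: the size hint has been ignored since the epoll_create1() rework, but a size of 0 was historically rejected, so the check is tightened from size < 0 to size <= 0 to keep that contract. A minimal sketch of the user-visible behaviour, assuming a kernel with this check:

#include <sys/epoll.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	int fd = epoll_create(0);	/* a size of 0 is rejected */
	if (fd < 0 && errno == EINVAL)
		printf("epoll_create(0) -> EINVAL\n");

	fd = epoll_create(16);		/* any positive size works; the value itself is ignored */
	if (fd >= 0)
		printf("epoll_create(16) -> fd %d\n", fd);
	return 0;
}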
fsnotify_open(file->f_path.dentry); err = deny_write_access(file); - if (err) { - fput(file); - goto out; - } + if (err) + goto exit; +out: return file; - out_path_put: - release_open_intent(&nd); - path_put(&nd.path); - out: +exit: + fput(file); return ERR_PTR(err); } EXPORT_SYMBOL(open_exec); @@ -1060,7 +1032,6 @@ EXPORT_SYMBOL(install_exec_creds); int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; - unsigned long flags; unsigned n_fs; int res = 0; @@ -1068,21 +1039,22 @@ int check_unsafe_exec(struct linux_binprm *bprm) n_fs = 1; write_lock(&p->fs->lock); - lock_task_sighand(p, &flags); + rcu_read_lock(); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) n_fs++; } + rcu_read_unlock(); if (p->fs->users > n_fs) { bprm->unsafe |= LSM_UNSAFE_SHARE; } else { - if (p->fs->in_exec) - res = -EAGAIN; - p->fs->in_exec = 1; + res = -EAGAIN; + if (!p->fs->in_exec) { + p->fs->in_exec = 1; + res = 1; + } } - - unlock_task_sighand(p, &flags); write_unlock(&p->fs->lock); return res; @@ -1284,6 +1256,7 @@ int do_execve(char * filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; + bool clear_in_exec; int retval; retval = unshare_files(&displaced); @@ -1306,8 +1279,9 @@ int do_execve(char * filename, goto out_unlock; retval = check_unsafe_exec(bprm); - if (retval) + if (retval < 0) goto out_unlock; + clear_in_exec = retval; file = open_exec(filename); retval = PTR_ERR(file); @@ -1355,9 +1329,7 @@ int do_execve(char * filename, goto out; /* execve succeeded */ - write_lock(&current->fs->lock); current->fs->in_exec = 0; - write_unlock(&current->fs->lock); current->in_execve = 0; mutex_unlock(&current->cred_exec_mutex); acct_update_integrals(current); @@ -1377,9 +1349,8 @@ out_file: } out_unmark: - write_lock(&current->fs->lock); - current->fs->in_exec = 0; - write_unlock(&current->fs->lock); + if (clear_in_exec) + current->fs->in_exec = 0; out_unlock: current->in_execve = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b43b95563663..acf678831103 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode, if (depth == 0) return (err); -reread: - partial = ext2_get_branch(inode, depth, offsets, chain, &err); + partial = ext2_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); @@ -602,15 +601,16 @@ reread: while (count < maxblocks && count <= blocks_to_boundary) { ext2_fsblk_t blk; - if (!verify_chain(chain, partial)) { + if (!verify_chain(chain, chain + depth - 1)) { /* * Indirect block might be removed by * truncate while we were reading it. * Handling of that case: forget what we've * got now, go to reread. */ + err = -EAGAIN; count = 0; - goto changed; + break; } blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) @@ -618,7 +618,8 @@ reread: else break; } - goto got_it; + if (err != -EAGAIN) + goto got_it; } /* Next simple case - plain lookup or failed read of indirect block */ @@ -626,6 +627,33 @@ reread: goto cleanup; mutex_lock(&ei->truncate_mutex); + /* + * If the indirect block is missing while we are reading + * the chain(ext3_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not.
+ * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } /* * Okay, we need to do block allocation. Lazily initialize the block @@ -683,12 +711,6 @@ cleanup: partial--; } return err; -changed: - while (partial > chain) { - brelse(partial->bh); - partial--; - } - goto reread; } int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f983225266dc..5c4afe652245 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index 8e0cfe44b0fc..fb3c1a21b135 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -28,6 +28,25 @@ config EXT3_FS To compile this file system support as a module, choose M here: the module will be called ext3. +config EXT3_DEFAULTS_TO_ORDERED + bool "Default to 'data=ordered' in ext3 (legacy option)" + depends on EXT3_FS + help + If a filesystem does not explicitly specify a data ordering + mode, and the journal capability allowed it, ext3 used to + historically default to 'data=ordered'. + + That was a rather unfortunate choice, because it leads to all + kinds of latency problems, and the 'data=writeback' mode is more + appropriate these days. + + You should probably always answer 'n' here, and if you really + want to use 'data=ordered' mode, set it in the filesystem itself + with 'tune2fs -o journal_data_ordered'. + + But if you really want to enable the legacy default, you can do + so by answering 'y' to this question. 
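Note on the EXT3_DEFAULTS_TO_ORDERED option above: it only selects the default journalling mode; either mode can still be requested explicitly at mount time. A minimal sketch using mount(2); the device and mount point are illustrative and the call needs privilege:

#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* request ordered mode explicitly, overriding the compiled-in default */
	if (mount("/dev/sdb1", "/mnt/data", "ext3", 0, "data=ordered") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}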
+ config EXT3_FS_XATTR bool "Ext3 extended attributes" depends on EXT3_FS diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 466a332e0bd1..fcfa24361856 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1521,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page, if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { - /* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */ - return block_write_full_page(page, NULL, wbc); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } } - page_bufs = page_buffers(page); - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { @@ -1581,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 9e5b8e387e1e..599dbfe504c3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -44,6 +44,12 @@ #include "acl.h" #include "namei.h" +#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA +#else + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA +#endif + static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, @@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) cope, else JOURNAL_DATA */ if (journal_check_available_features (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); else set_opt(sbi->s_mount_opt, JOURNAL_DATA); break; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ac77d8b8251d..e3a55eb8b26a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -326,11 +326,14 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext); + ext4_fsblk_t block = ext_pblock(ext), valid_block; int len = ext4_ext_get_actual_len(ext); struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - ((block + len) > ext4_blocks_count(es)))) + + valid_block = le32_to_cpu(es->s_first_data_block) + + EXT4_SB(inode->i_sb)->s_gdb_count; + if (unlikely(block <= valid_block || + ((block + len) > ext4_blocks_count(es)))) return 0; else return 1; @@ -339,10 +342,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx); + 
ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - (block > ext4_blocks_count(es)))) + + valid_block = le32_to_cpu(es->s_first_data_block) + + EXT4_SB(inode->i_sb)->s_gdb_count; + if (unlikely(block <= valid_block || + (block >= ext4_blocks_count(es)))) return 0; else return 1; @@ -1835,11 +1841,13 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, { struct ext4_ext_cache *cex; BUG_ON(len == 0); + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_type = type; cex->ec_block = block; cex->ec_len = len; cex->ec_start = start; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } /* @@ -1896,12 +1904,17 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; + int ret = EXT4_EXT_CACHE_NO; + /* + * We borrow i_block_reservation_lock to protect i_cached_extent + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; /* has cache valid data? */ if (cex->ec_type == EXT4_EXT_CACHE_NO) - return EXT4_EXT_CACHE_NO; + goto errout; BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && cex->ec_type != EXT4_EXT_CACHE_EXTENT); @@ -1912,11 +1925,11 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); - return cex->ec_type; + ret = cex->ec_type; } - - /* not in cache */ - return EXT4_EXT_CACHE_NO; +errout: + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return ret; } /* @@ -2416,8 +2429,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) len = ee_len; bio = bio_alloc(GFP_NOIO, len); - if (!bio) - return -ENOMEM; bio->bi_sector = ee_pblock; bio->bi_bdev = inode->i_sb->s_bdev; @@ -2871,6 +2882,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (allocated > max_blocks) allocated = max_blocks; set_buffer_unwritten(bh_result); + bh_result->b_bdev = inode->i_sb->s_bdev; + bh_result->b_blocknr = newblock; goto out2; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 47b84e8df568..f18e0a08a6b5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -585,6 +585,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, fallback: ngroups = sbi->s_groups_count; avefreei = freei / ngroups; +fallback_retry: parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; @@ -602,7 +603,7 @@ fallback: * filesystems the above test can fail to find any blockgroups */ avefreei = 0; - goto fallback; + goto fallback_retry; } return -1; @@ -831,11 +832,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { ret2 = find_group_other(sb, dir, &group, mode); - if (ret2 == 0 && once) + if (ret2 == 0 && once) { once = 0; printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", dir->i_ino); + } } goto got_group; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a2e7952bc5f9..2a9ffd528dd1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -372,16 +372,16 @@ static int ext4_block_to_path(struct inode *inode, } static int __ext4_check_blockref(const char *function, struct inode *inode, - unsigned int *p, unsigned int max) { + __le32 *p, unsigned int max) { unsigned int maxblocks = 
ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); - unsigned int *bref = p; + __le32 *bref = p; while (bref < p+max) { - if (unlikely(*bref >= maxblocks)) { + if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { ext4_error(inode->i_sb, function, "block reference %u >= max (%u) " "in inode #%lu, offset=%d", - *bref, maxblocks, + le32_to_cpu(*bref), maxblocks, inode->i_ino, (int)(bref-p)); return -EIO; } @@ -1149,6 +1149,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, int retval; clear_buffer_mapped(bh); + clear_buffer_unwritten(bh); /* * Try to see if we can get the block without requesting @@ -1179,6 +1180,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, return retval; /* + * When we call get_blocks without the create flag, the + * BH_Unwritten flag could have gotten set if the blocks + * requested were part of a uninitialized extent. We need to + * clear this flag now that we are committed to convert all or + * part of the uninitialized extent to be an initialized + * extent. This is because we need to avoid the combination + * of BH_Unwritten and BH_Mapped flags being simultaneously + * set on the buffer_head. + */ + clear_buffer_unwritten(bh); + + /* * New blocks allocate and/or writing to uninitialized extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_blocks() @@ -2297,6 +2310,10 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; + sector_t invalid_block = ~((sector_t) 0xffff); + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; BUG_ON(create == 0); BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); @@ -2318,11 +2335,18 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* not enough space to reserve */ return ret; - map_bh(bh_result, inode->i_sb, 0); + map_bh(bh_result, inode->i_sb, invalid_block); set_buffer_new(bh_result); set_buffer_delay(bh_result); } else if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); + /* + * With sub-block writes into unwritten extents + * we also need to mark the buffer as new so that + * the unwritten parts of the buffer gets correctly zeroed. 
+ */ + if (buffer_unwritten(bh_result)) + set_buffer_new(bh_result); ret = 0; } @@ -4357,11 +4381,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - } inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); @@ -4409,9 +4431,23 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - if (ei->i_flags & EXT4_EXTENTS_FL) { - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); + ret = 0; + if (ei->i_file_acl && + ((ei->i_file_acl < + (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + EXT4_SB(sb)->s_gdb_count)) || + (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + ext4_error(sb, __func__, + "bad extended attribute block %llu in inode #%lu", + ei->i_file_acl, inode->i_ino); + ret = -EIO; + goto bad_inode; + } else if (ei->i_flags & EXT4_EXTENTS_FL) { + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9987bba99db3..2958f4e6f222 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2508,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext4; + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " + "exceeds size of device (%llu blocks)\n", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + /* * It makes no sense for the first data block to be beyond the end * of the filesystem. diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index d0a69ff25375..182f9ffe2b51 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -95,3 +95,6 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here. See <file:Documentation/filesystems/vfat.txt> for more information. + + Enable any character sets you need in File Systems/Native Language + Support. 
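Note on the ext4_da_get_block_prep() hunk above: delayed-allocation buffers used to be mapped to block 0, a real block, so stray uses were hard to spot; the patch maps them to a sentinel that can never be a valid block on the filesystem. A standalone sketch of how that sentinel is chosen; the block count is an assumed value for illustration:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

int main(void)
{
	sector_t invalid_block = ~((sector_t)0xffff);
	sector_t blocks_count = 1ULL << 30;	/* assumed filesystem block count */

	/* if the filesystem could reach the first candidate, fall back to
	 * all-ones, as the patch does */
	if (invalid_block < blocks_count)
		invalid_block = ~(sector_t)0;

	printf("sentinel block = %#llx\n", (unsigned long long)invalid_block);
	return 0;
}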
diff --git a/fs/fcntl.c b/fs/fcntl.c index cc8e4de2fee5..1ad703150dee 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -117,11 +117,13 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; + int retval = oldfd; + rcu_read_lock(); if (!fcheck_files(files, oldfd)) - oldfd = -EBADF; + retval = -EBADF; rcu_read_unlock(); - return oldfd; + return retval; } return sys_dup3(oldfd, newfd, 0); } diff --git a/fs/filesystems.c b/fs/filesystems.c index 1aa70260e6d1..a24c58e181db 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) return retval; } -int get_filesystem_list(char * buf) +int __init get_filesystem_list(char *buf) { int len = 0; struct file_system_type * tmp; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2b25133524a3..06f30e965676 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -938,9 +938,9 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, - unsigned *nbytesp, int write) + size_t *nbytesp, int write) { - unsigned nbytes = *nbytesp; + size_t nbytes = *nbytesp; unsigned long user_addr = (unsigned long) buf; unsigned offset = user_addr & ~PAGE_MASK; int npages; @@ -955,7 +955,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, return 0; } - nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); down_read(&current->mm->mmap_sem); @@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_MAYSHARE) return -ENODEV; + invalidate_inode_pages2(file->f_mapping); + return generic_file_mmap(file, vma); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 459b73dd45e1..91f7c85f1ffd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -19,6 +19,7 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> +#include <linux/smp_lock.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -259,7 +260,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, static void fuse_umount_begin(struct super_block *sb) { + lock_kernel(); fuse_abort_conn(get_fuse_conn_super(sb)); + unlock_kernel(); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -908,6 +911,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_put_root: dput(root_dentry); err_put_conn: + bdi_destroy(&fc->bdi); fuse_conn_put(fc); err_fput: fput(file); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 3984e47d1d33..ff4981090489 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -597,7 +597,6 @@ __acquires(&gl->gl_spin) GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); - down_read(&gfs2_umount_flush_sem); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && gl->gl_demote_state != gl->gl_state) { if (find_first_holder(gl)) @@ -614,15 +613,14 @@ __acquires(&gl->gl_spin) if (ret == 0) goto out_unlock; if (ret == 2) - goto out_sem; + goto out; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ } do_xmote(gl, gh, gl->gl_target);
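Note on the fs/fcntl.c hunk above: for dup2(fd, fd) the call must return fd if it is open and -EBADF if it is not, but oldfd is declared unsigned int, so storing -EBADF in it and returning it produced a large positive number instead of an error; the fix keeps the result in a signed local. A userspace sketch of the two cases; descriptor 99 is assumed to be closed:

#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	int r = dup2(0, 0);		/* both fds equal and open: returns the fd */
	printf("dup2(0, 0) = %d\n", r);	/* prints 0 */

	r = dup2(99, 99);		/* fd 99 assumed closed */
	if (r == -1 && errno == EBADF)
		printf("dup2(99, 99) -> EBADF\n");
	return 0;
}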
-out_sem: - up_read(&gfs2_umount_flush_sem); +out: return; out_sched: @@ -631,7 +629,7 @@ out_sched: gfs2_glock_put(gl); out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - goto out_sem; + goto out; } static void glock_work_func(struct work_struct *work) @@ -641,6 +639,7 @@ static void glock_work_func(struct work_struct *work) if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) finish_xmote(gl, gl->gl_reply); + down_read(&gfs2_umount_flush_sem); spin_lock(&gl->gl_spin); if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && @@ -653,6 +652,7 @@ static void glock_work_func(struct work_struct *work) } run_queue(gl, 0); spin_unlock(&gl->gl_spin); + up_read(&gfs2_umount_flush_sem); if (!delay || queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); @@ -1304,6 +1304,7 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) nr--; if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); + got_ref = 0; } spin_lock(&lru_lock); if (may_demote) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index bf23a62aa925..70f87f43afa2 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(metamapping); mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); + /* + * Writeback of the data mapping may cause the dirty flag to be set + * so we have to clear it again here. + */ + smp_mb__before_clear_bit(); + clear_bit(GLF_DIRTY, &gl->gl_flags); } /** diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7b277d449155..5a31d426116f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -137,15 +137,15 @@ void gfs2_set_iop(struct inode *inode) if (S_ISREG(mode)) { inode->i_op = &gfs2_file_iops; if (gfs2_localflocks(sdp)) - inode->i_fop = gfs2_file_fops_nolock; + inode->i_fop = &gfs2_file_fops_nolock; else - inode->i_fop = gfs2_file_fops; + inode->i_fop = &gfs2_file_fops; } else if (S_ISDIR(mode)) { inode->i_op = &gfs2_dir_iops; if (gfs2_localflocks(sdp)) - inode->i_fop = gfs2_dir_fops_nolock; + inode->i_fop = &gfs2_dir_fops_nolock; else - inode->i_fop = gfs2_dir_fops; + inode->i_fop = &gfs2_dir_fops; } else if (S_ISLNK(mode)) { inode->i_op = &gfs2_symlink_iops; } else { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index dca4fee3078b..c30be2b66580 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -101,21 +101,23 @@ void gfs2_dinode_print(const struct gfs2_inode *ip); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; extern const struct inode_operations gfs2_symlink_iops; -extern const struct file_operations *gfs2_file_fops_nolock; -extern const struct file_operations *gfs2_dir_fops_nolock; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; extern void gfs2_set_inode_flags(struct inode *inode); #ifdef CONFIG_GFS2_FS_LOCKING_DLM -extern const struct file_operations *gfs2_file_fops; -extern const struct file_operations *gfs2_dir_fops; +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; + static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) { return sdp->sd_args.ar_localflocks; } #else /* Single node only */ -#define gfs2_file_fops NULL -#define gfs2_dir_fops NULL +#define gfs2_file_fops gfs2_file_fops_nolock +#define gfs2_dir_fops gfs2_dir_fops_nolock + static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) { return 1; diff --git 
a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 70b9b8548945..5d82e91887e3 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -413,7 +413,9 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (ret) + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else if (ret) ret = VM_FAULT_SIGBUS; return ret; } @@ -705,7 +707,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) } } -const struct file_operations *gfs2_file_fops = &(const struct file_operations){ +const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, @@ -723,7 +725,7 @@ const struct file_operations *gfs2_file_fops = &(const struct file_operations){ .setlease = gfs2_setlease, }; -const struct file_operations *gfs2_dir_fops = &(const struct file_operations){ +const struct file_operations gfs2_dir_fops = { .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, @@ -735,7 +737,7 @@ const struct file_operations *gfs2_dir_fops = &(const struct file_operations){ #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ -const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){ +const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, @@ -751,7 +753,7 @@ const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operat .setlease = generic_setlease, }; -const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){ +const struct file_operations gfs2_dir_fops_nolock = { .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 51883b3ad89c..1ff9473ea753 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -272,11 +272,6 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) lock_page(page); bio = bio_alloc(GFP_NOFS, 1); - if (unlikely(!bio)) { - __free_page(page); - return -ENOBUFS; - } - bio->bi_sector = sector * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -1287,21 +1282,21 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, static struct super_block *get_gfs2_sb(const char *dev_name) { struct super_block *sb; - struct nameidata nd; + struct path path; int error; - error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); + error = kern_path(dev_name, LOOKUP_FOLLOW, &path); if (error) { printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", dev_name, error); return NULL; } - sb = nd.path.dentry->d_inode->i_sb; + sb = path.dentry->d_inode->i_sb; if (sb && (sb->s_type == &gfs2_fs_type)) atomic_inc(&sb->s_active); else sb = NULL; - path_put(&nd.path); + path_put(&path); return sb; } diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index abd5429ae285..1c70fa5168d6 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -371,6 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, ip = ghs[1].gh_gl->gl_object; ip->i_disksize = size; + i_size_write(inode, size); error = gfs2_meta_inode_buffer(ip, &dibh); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8d53f66b5bcc..152e6c4a0dca 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -81,7 +81,7 @@ struct gfs2_quota_change_host { static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); -static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(qd_lru_lock); int gfs2_shrink_qd_memory(int nr, gfp_t 
gfp_mask) { @@ -1364,7 +1364,7 @@ int gfs2_quotad(void *data) refrigerator(); t = min(quotad_timeo, statfs_timeo); - prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); spin_lock(&sdp->sd_trunc_lock); empty = list_empty(&sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f03d024038ea..565038243fa2 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -212,8 +212,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, if (tmp == 0) return BFITNOENT; ptr--; - bit = fls64(tmp); - bit--; /* fls64 always adds one to the bit count */ + bit = __ffs64(tmp); bit /= 2; /* two bits per entry in the bitmap */ return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; } @@ -1445,10 +1444,12 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd = al->al_rgd; u32 goal, blk; u64 block; + int error; if (rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; @@ -1461,7 +1462,13 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; ip->i_goal = block; - + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); + brelse(dibh); + } gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); rgd->rd_free -= *n; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9435dda8f1e0..a1cbff2b4d99 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask) BUG(); return 0; } + + if (!tree) + return 0; + if (tree->node_size >= PAGE_CACHE_SIZE) { nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 36ca2e1a4fa3..7b6165f25fbe 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb) if (HFS_SB(sb)->nls_disk) unload_nls(HFS_SB(sb)->nls_disk); + free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); kfree(HFS_SB(sb)); sb->s_fs_info = NULL; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index fecf402d7b8a..fc77965be841 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -423,8 +423,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) if (!(*flags & MS_RDONLY)) mark_dirty(s); - kfree(s->s_options); - s->s_options = new_opts; + replace_mount_options(s, new_opts); return 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 23a3c76711e0..c1462d43e721 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -26,7 +26,6 @@ #include <linux/pagevec.h> #include <linux/parser.h> #include <linux/mman.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/dnotify.h> #include <linux/statfs.h> @@ -313,16 +312,6 @@ out: return retval; } -/* - * Read a page. Again trivial. If it didn't already exist - * in the page cache, it is zero-filled. 
- */ -static int hugetlbfs_readpage(struct file *file, struct page * page) -{ - unlock_page(page); - return -EINVAL; -} - static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -702,7 +691,6 @@ static void hugetlbfs_destroy_inode(struct inode *inode) } static const struct address_space_operations hugetlbfs_aops = { - .readpage = hugetlbfs_readpage, .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .set_page_dirty = hugetlbfs_set_page_dirty, @@ -842,7 +830,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) bad_val: printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", args[0].from, p); - return 1; + return -EINVAL; } static int diff --git a/fs/inode.c b/fs/inode.c index d06d6d268de9..0571983755dc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -99,7 +99,7 @@ static DEFINE_MUTEX(iprune_mutex); */ struct inodes_stat_t inodes_stat; -static struct kmem_cache * inode_cachep __read_mostly; +static struct kmem_cache *inode_cachep __read_mostly; static void wake_up_inode(struct inode *inode) { @@ -124,7 +124,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) static struct inode_operations empty_iops; static const struct file_operations empty_fops; - struct address_space * const mapping = &inode->i_data; + struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; @@ -216,7 +216,7 @@ static struct inode *alloc_inode(struct super_block *sb) return NULL; } -void destroy_inode(struct inode *inode) +void destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); @@ -252,12 +252,11 @@ void inode_init_once(struct inode *inode) mutex_init(&inode->inotify_mutex); #endif } - EXPORT_SYMBOL(inode_init_once); static void init_once(void *foo) { - struct inode * inode = (struct inode *) foo; + struct inode *inode = (struct inode *) foo; inode_init_once(inode); } @@ -265,7 +264,7 @@ static void init_once(void *foo) /* * inode_lock must be held */ -void __iget(struct inode * inode) +void __iget(struct inode *inode) { if (atomic_read(&inode->i_count)) { atomic_inc(&inode->i_count); @@ -289,7 +288,7 @@ void clear_inode(struct inode *inode) { might_sleep(); invalidate_inode_buffers(inode); - + BUG_ON(inode->i_data.nrpages); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); @@ -303,7 +302,6 @@ void clear_inode(struct inode *inode) cd_forget(inode); inode->i_state = I_CLEAR; } - EXPORT_SYMBOL(clear_inode); /* @@ -351,8 +349,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) next = head->next; for (;;) { - struct list_head * tmp = next; - struct inode * inode; + struct list_head *tmp = next; + struct inode *inode; /* * We can reschedule here without worrying about the list's @@ -391,7 +389,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) * fails because there are busy inodes then a non zero value is returned. * If the discard is successful all the inodes have been discarded. */ -int invalidate_inodes(struct super_block * sb) +int invalidate_inodes(struct super_block *sb) { int busy; LIST_HEAD(throw_away); @@ -407,7 +405,6 @@ int invalidate_inodes(struct super_block * sb) return busy; } - EXPORT_SYMBOL(invalidate_inodes); static int can_unuse(struct inode *inode) @@ -504,7 +501,7 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask) * Nasty deadlock avoidance. 
We may hold various FS locks, * and we don't want to recurse into the FS that called us * in clear_inode() and friends.. - */ + */ if (!(gfp_mask & __GFP_FS)) return -1; prune_icache(nr); @@ -524,10 +521,13 @@ static void __wait_on_freeing_inode(struct inode *inode); * by hand after calling find_inode now! This simplifies iunique and won't * add any additional branch in the common code. */ -static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +static struct inode *find_inode(struct super_block *sb, + struct hlist_head *head, + int (*test)(struct inode *, void *), + void *data) { struct hlist_node *node; - struct inode * inode = NULL; + struct inode *inode = NULL; repeat: hlist_for_each_entry(inode, node, head, i_hash) { @@ -548,10 +548,11 @@ repeat: * find_inode_fast is the fast path version of find_inode, see the comment at * iget_locked for details. */ -static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +static struct inode *find_inode_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) { struct hlist_node *node; - struct inode * inode = NULL; + struct inode *inode = NULL; repeat: hlist_for_each_entry(inode, node, head, i_hash) { @@ -631,10 +632,10 @@ struct inode *new_inode(struct super_block *sb) * here to attempt to avoid that. */ static unsigned int last_ino; - struct inode * inode; + struct inode *inode; spin_lock_prefetch(&inode_lock); - + inode = alloc_inode(sb); if (inode) { spin_lock(&inode_lock); @@ -645,7 +646,6 @@ struct inode *new_inode(struct super_block *sb) } return inode; } - EXPORT_SYMBOL(new_inode); void unlock_new_inode(struct inode *inode) @@ -674,7 +674,6 @@ void unlock_new_inode(struct inode *inode) inode->i_state &= ~(I_LOCK|I_NEW); wake_up_inode(inode); } - EXPORT_SYMBOL(unlock_new_inode); /* @@ -683,13 +682,17 @@ EXPORT_SYMBOL(unlock_new_inode); * We no longer cache the sb_flags in i_flags - see fs.h * -- rmk@arm.uk.linux.org */ -static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +static struct inode *get_new_inode(struct super_block *sb, + struct hlist_head *head, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), + void *data) { - struct inode * inode; + struct inode *inode; inode = alloc_inode(sb); if (inode) { - struct inode * old; + struct inode *old; spin_lock(&inode_lock); /* We released the lock, so.. */ @@ -731,13 +734,14 @@ set_failed: * get_new_inode_fast is the fast path version of get_new_inode, see the * comment at iget_locked for details. */ -static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +static struct inode *get_new_inode_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) { - struct inode * inode; + struct inode *inode; inode = alloc_inode(sb); if (inode) { - struct inode * old; + struct inode *old; spin_lock(&inode_lock); /* We released the lock, so.. 
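The find_inode()/get_new_inode() pairing above is the usual shape for a sleeping allocation feeding a locked hash: allocate first, then recheck under inode_lock in case another thread inserted the same inode meanwhile. A minimal sketch of the pattern; struct obj, obj_alloc(), obj_free(), hash_find() and hash_add() are hypothetical helpers invented for this example:

	static DEFINE_SPINLOCK(table_lock);

	static struct obj *obj_get_or_create(unsigned long key)
	{
		struct obj *new = obj_alloc();		/* may sleep; no lock held yet */
		struct obj *old;

		if (!new)
			return NULL;
		spin_lock(&table_lock);
		old = hash_find(key);			/* we released the lock, so recheck */
		if (!old) {
			hash_add(key, new);
			spin_unlock(&table_lock);
			return new;
		}
		spin_unlock(&table_lock);
		obj_free(new);				/* lost the race to another thread */
		return old;
	}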
*/ @@ -823,7 +827,6 @@ struct inode *igrab(struct inode *inode) spin_unlock(&inode_lock); return inode; } - EXPORT_SYMBOL(igrab); /** @@ -924,7 +927,6 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, return ifind(sb, head, test, data, 0); } - EXPORT_SYMBOL(ilookup5_nowait); /** @@ -953,7 +955,6 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, return ifind(sb, head, test, data, 1); } - EXPORT_SYMBOL(ilookup5); /** @@ -976,7 +977,6 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) return ifind_fast(sb, head, ino); } - EXPORT_SYMBOL(ilookup); /** @@ -1015,7 +1015,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, */ return get_new_inode(sb, head, test, set, data); } - EXPORT_SYMBOL(iget5_locked); /** @@ -1047,7 +1046,6 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) */ return get_new_inode_fast(sb, head, ino); } - EXPORT_SYMBOL(iget_locked); int insert_inode_locked(struct inode *inode) @@ -1076,7 +1074,6 @@ int insert_inode_locked(struct inode *inode) iput(old); } } - EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, unsigned long hashval, @@ -1106,7 +1103,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, iput(old); } } - EXPORT_SYMBOL(insert_inode_locked4); /** @@ -1124,7 +1120,6 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) hlist_add_head(&inode->i_hash, head); spin_unlock(&inode_lock); } - EXPORT_SYMBOL(__insert_inode_hash); /** @@ -1139,7 +1134,6 @@ void remove_inode_hash(struct inode *inode) hlist_del_init(&inode->i_hash); spin_unlock(&inode_lock); } - EXPORT_SYMBOL(remove_inode_hash); /* @@ -1187,7 +1181,6 @@ void generic_delete_inode(struct inode *inode) BUG_ON(inode->i_state != I_CLEAR); destroy_inode(inode); } - EXPORT_SYMBOL(generic_delete_inode); static void generic_forget_inode(struct inode *inode) @@ -1237,12 +1230,11 @@ void generic_drop_inode(struct inode *inode) else generic_forget_inode(inode); } - EXPORT_SYMBOL_GPL(generic_drop_inode); /* * Called when we're dropping the last reference - * to an inode. + * to an inode. * * Call the FS "drop()" function, defaulting to * the legacy UNIX filesystem behaviour.. @@ -1262,7 +1254,7 @@ static inline void iput_final(struct inode *inode) } /** - * iput - put an inode + * iput - put an inode * @inode: inode to put * * Puts an inode, dropping its usage count. If the inode use count hits @@ -1279,7 +1271,6 @@ void iput(struct inode *inode) iput_final(inode); } } - EXPORT_SYMBOL(iput); /** @@ -1290,10 +1281,10 @@ EXPORT_SYMBOL(iput); * Returns the block number on the device holding the inode that * is the disk block number for the block of the file requested. * That is, asked for block 4 of inode 1 the function will return the - * disk block relative to the disk start that holds that block of the + * disk block relative to the disk start that holds that block of the * file. 
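bmap(), cleaned up just below, is what backs the userspace FIBMAP ioctl: given a file-relative block it returns the device-relative block, or 0 where nothing is mapped. A small probe, assuming an arbitrary target path (FIBMAP needs CAP_SYS_RAWIO, so run it as root):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FIBMAP */

	int main(int argc, char **argv)
	{
		int fd, blk = 0;	/* in: file block 0; out: device block */

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0) {
			perror(argv[1]);
			return 1;
		}
		if (ioctl(fd, FIBMAP, &blk) < 0) {
			perror("FIBMAP");
			close(fd);
			return 1;
		}
		printf("block 0 of %s maps to device block %d\n", argv[1], blk);
		close(fd);
		return 0;
	}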
*/ -sector_t bmap(struct inode * inode, sector_t block) +sector_t bmap(struct inode *inode, sector_t block) { sector_t res = 0; if (inode->i_mapping->a_ops->bmap) @@ -1425,7 +1416,6 @@ void file_update_time(struct file *file) mark_inode_dirty_sync(inode); mnt_drop_write(file->f_path.mnt); } - EXPORT_SYMBOL(file_update_time); int inode_needs_sync(struct inode *inode) @@ -1436,7 +1426,6 @@ int inode_needs_sync(struct inode *inode) return 1; return 0; } - EXPORT_SYMBOL(inode_needs_sync); int inode_wait(void *word) @@ -1470,42 +1459,6 @@ static void __wait_on_freeing_inode(struct inode *inode) spin_lock(&inode_lock); } -/* - * We rarely want to lock two inodes that do not have a parent/child - * relationship (such as directory, child inode) simultaneously. The - * vast majority of file systems should be able to get along fine - * without this. Do not use these functions except as a last resort. - */ -void inode_double_lock(struct inode *inode1, struct inode *inode2) -{ - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { - if (inode1) - mutex_lock(&inode1->i_mutex); - else if (inode2) - mutex_lock(&inode2->i_mutex); - return; - } - - if (inode1 < inode2) { - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); - } -} -EXPORT_SYMBOL(inode_double_lock); - -void inode_double_unlock(struct inode *inode1, struct inode *inode2) -{ - if (inode1) - mutex_unlock(&inode1->i_mutex); - - if (inode2 && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); -} -EXPORT_SYMBOL(inode_double_unlock); - static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { diff --git a/fs/ioctl.c b/fs/ioctl.c index ac2d47e43926..82d9c42b8bac 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -258,7 +258,7 @@ int __generic_block_fiemap(struct inode *inode, long long length = 0, map_len = 0; u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; - int ret = 0; + int ret = 0, past_eof = 0, whole_file = 0; if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) return ret; @@ -266,6 +266,9 @@ int __generic_block_fiemap(struct inode *inode, start_blk = logical_to_blk(inode, start); length = (long long)min_t(u64, len, i_size_read(inode)); + if (length < len) + whole_file = 1; + map_len = length; do { @@ -282,11 +285,26 @@ int __generic_block_fiemap(struct inode *inode, /* HOLE */ if (!buffer_mapped(&tmp)) { + length -= blk_to_logical(inode, 1); + start_blk++; + + /* + * we want to handle the case where there is an + * allocated block at the front of the file, and then + * nothing but holes up to the end of the file properly, + * to make sure that extent at the front gets properly + * marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && + blk_to_logical(inode, start_blk) >= + blk_to_logical(inode, 0)+i_size_read(inode)) + past_eof = 1; + /* * first hole after going past the EOF, this is our * last extent */ - if (length <= 0) { + if (past_eof && size) { flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, @@ -294,15 +312,37 @@ int __generic_block_fiemap(struct inode *inode, break; } - length -= blk_to_logical(inode, 1); - /* if we have holes up to/past EOF then we're done */ - if (length <= 0) + if (length <= 0 || past_eof) break; - - start_blk++; } else { - if (length <= 0 && size) { + /* + * we have gone over the length of what we wanted to + * 
map, and it wasn't the entire file, so add the extent + * we got last time and exit. + * + * This is for the case where say we want to map all the + * way up to the second to the last block in a file, but + * the last block is a hole, making the second to last + * block FIEMAP_EXTENT_LAST. In this case we want to + * see if there is a hole after the second to last block + * so we can mark it properly. If we found data after + * we exceeded the length we were requesting, then we + * are good to go, just add the extent to the fieinfo + * and break + */ + if (length <= 0 && !whole_file) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + break; + } + + /* + * if size != 0 then we know we already have an extent + * to add, so add it. + */ + if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); @@ -319,19 +359,14 @@ int __generic_block_fiemap(struct inode *inode, start_blk += logical_to_blk(inode, size); /* - * if we are past the EOF we need to loop again to see - * if there is a hole so we can mark this extent as the - * last one, and if not keep mapping things until we - * find a hole, or we run out of slots in the extent - * array + * If we are past the EOF, then we need to make sure as + * soon as we find a hole that the last extent we found + * is marked with FIEMAP_EXTENT_LAST */ - if (length <= 0) - continue; - - ret = fiemap_fill_next_extent(fieinfo, logical, phys, - size, flags); - if (ret) - break; + if (!past_eof && + logical+size >= + blk_to_logical(inode, 0)+i_size_read(inode)) + past_eof = 1; } cond_resched(); } while (1); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f8077b9c8981..06560c520f49 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + /* + * Use plugged writes here, since we want to submit several before + * we unplug the device. We don't do explicit unplugging in here, + * instead we rely on sync_buffer() doing the unplug for us. + */ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC; + write_op = WRITE_SYNC_PLUG; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -497,7 +502,7 @@ void journal_commit_transaction(journal_t *journal) err = 0; } - journal_write_revoke_records(journal, commit_transaction); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index c7bd649bbbdc..da6cd9bdaabc 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -55,6 +55,25 @@ * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. 
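Returning to the __generic_block_fiemap() rework above: the point of past_eof is that an extent is only flushed once the walk knows whether a hole follows it, so the allocated-block-then-holes-to-EOF case gets FIEMAP_EXTENT_LAST right. A toy, self-contained walkthrough of exactly that case (is_mapped() and the block counts are inventions of this sketch, not fs code):

	#include <stdio.h>

	#define FILE_BLOCKS 8				/* i_size covers blocks 0..7 */

	static int is_mapped(int blk) { return blk == 0; }	/* one block, then holes */

	int main(void)
	{
		int blk, past_eof = 0, have_extent = 0;

		for (blk = 0; ; blk++) {
			if (blk >= FILE_BLOCKS)
				past_eof = 1;		/* now probing beyond i_size */
			if (is_mapped(blk)) {
				have_extent = 1;	/* defer: it may turn out to be last */
				continue;
			}
			/* first hole past EOF: flush the pending extent as LAST */
			if (past_eof && have_extent) {
				printf("extent at block 0: FIEMAP_EXTENT_LAST\n");
				break;
			}
			if (past_eof)
				break;			/* nothing pending; done */
		}
		return 0;
	}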
Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. */ #ifndef __KERNEL__ @@ -67,6 +86,7 @@ #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> +#include <linux/bio.h> #endif #include <linux/log2.h> @@ -99,8 +119,8 @@ struct jbd_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -402,8 +422,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. - * - * The caller must have the journal locked. */ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { @@ -481,12 +499,9 @@ void journal_switch_revoke_table(journal_t *journal) /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. - * - * Called with the journal lock held. */ - void journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) + transaction_t *transaction, int write_op) { struct journal_head *descriptor; struct jbd_revoke_record_s *record; @@ -510,14 +525,14 @@ void journal_write_revoke_records(journal_t *journal, hash_list->next; write_one_revoke_record(journal, transaction, &descriptor, &offset, - record); + record, write_op); count++; list_del(&record->hash); kmem_cache_free(revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -530,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, struct journal_head **descriptorp, int *offsetp, - struct jbd_revoke_record_s *record) + struct jbd_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -549,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -586,7 +602,7 @@ static void write_one_revoke_record(journal_t *journal, static void flush_descriptor(journal_t *journal, struct journal_head *descriptor, - int offset) + int offset, int write_op) { journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -601,7 +617,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block((write_op == WRITE) ? 
SWRITE : SWRITE_SYNC_PLUG, 1, &bh); } #endif diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4ea72377c7a2..0b7d3b8226fd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal, set_buffer_ordered(bh); barrier_done = 1; } - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); if (barrier_done) clear_buffer_ordered(bh); @@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal, lock_buffer(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); } *cbh = bh; return ret; @@ -190,7 +190,7 @@ retry: set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); if (ret) { unlock_buffer(bh); return ret; @@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + /* + * Use plugged writes here, since we want to submit several before + * we unplug the device. We don't do explicit unplugging in here, + * instead we rely on sync_buffer() doing the unplug for us. + */ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC; + write_op = WRITE_SYNC_PLUG; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -501,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (err) jbd2_journal_abort(journal, err); - jbd2_journal_write_revoke_records(journal, commit_transaction); + jbd2_journal_write_revoke_records(journal, commit_transaction, + write_op); jbd_debug(3, "JBD: commit phase 2\n"); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index bbe6d592d8b3..a360b06af2e3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -86,6 +86,7 @@ #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> +#include <linux/bio.h> #endif #include <linux/log2.h> @@ -118,8 +119,8 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd2_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd2_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -499,7 +500,8 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) * revoke hash, deleting the entries as we go. 
*/ void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) + transaction_t *transaction, + int write_op) { struct journal_head *descriptor; struct jbd2_revoke_record_s *record; @@ -523,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal, hash_list->next; write_one_revoke_record(journal, transaction, &descriptor, &offset, - record); + record, write_op); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -543,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, struct journal_head **descriptorp, int *offsetp, - struct jbd2_revoke_record_s *record) + struct jbd2_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -562,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -607,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal, static void flush_descriptor(journal_t *journal, struct journal_head *descriptor, - int offset) + int offset, int write_op) { jbd2_journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -622,7 +625,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block((write_op == WRITE) ? 
SWRITE : SWRITE_SYNC_PLUG, 1, &bh); } #endif diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 77ccf8cb0823..043740dde20c 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size) size_t s; size -= sizeof(struct jffs2_acl_header); - s = size - 4 * sizeof(struct jffs2_acl_entry_short); - if (s < 0) { + if (size < 4 * sizeof(struct jffs2_acl_entry_short)) { if (size % sizeof(struct jffs2_acl_entry_short)) return -1; return size / sizeof(struct jffs2_acl_entry_short); } else { + s = size - 4 * sizeof(struct jffs2_acl_entry_short); if (s % sizeof(struct jffs2_acl_entry)) return -1; return s / sizeof(struct jffs2_acl_entry) + 4; diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index f9211252b5f1..9eff2bdae8a7 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x) struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) { struct jffs2_xattr_datum *xd; - xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL); + xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); dbg_memalloc("%p\n", xd); - memset(xd, 0, sizeof(struct jffs2_xattr_datum)); xd->class = RAWNODE_CLASS_XATTR_DATUM; xd->node = (void *)xd; INIT_LIST_HEAD(&xd->xindex); @@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd) struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) { struct jffs2_xattr_ref *ref; - ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL); + ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); dbg_memalloc("%p\n", ref); - memset(ref, 0, sizeof(struct jffs2_xattr_ref)); ref->class = RAWNODE_CLASS_XATTR_REF; ref->node = (void *)ref; return ref; diff --git a/fs/libfs.c b/fs/libfs.c index 4910a36f516e..80046ddf5063 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -246,8 +246,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, return 0; Enomem: - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); return -ENOMEM; } @@ -575,6 +574,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, * possibly a read which collects the result - which is stored in a * file-local buffer. */ + +void simple_transaction_set(struct file *file, size_t n) +{ + struct simple_transaction_argresp *ar = file->private_data; + + BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); + + /* + * The barrier ensures that ar->size will really remain zero until + * ar->data is ready for reading. 
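The jffs2_acl_count() rewrite above dodges an unsigned-underflow trap: s is a size_t, so the old size - 4 * sizeof(...) could never be negative and the short-entry branch was unreachable. A standalone demonstration with toy entry sizes (the real struct sizes differ):

	#include <stdio.h>
	#include <stddef.h>

	#define SHORT_SZ 4	/* stand-in for sizeof(struct jffs2_acl_entry_short) */
	#define FULL_SZ  8	/* stand-in for sizeof(struct jffs2_acl_entry) */

	static int acl_count(size_t size)
	{
		if (size < 4 * SHORT_SZ)	/* compare first, subtract second */
			return size % SHORT_SZ ? -1 : (int)(size / SHORT_SZ);
		size -= 4 * SHORT_SZ;
		return size % FULL_SZ ? -1 : (int)(size / FULL_SZ) + 4;
	}

	int main(void)
	{
		/* old order: (size_t)8 - 16 wraps to a huge value, so "s < 0" never fired */
		printf("count = %d\n", acl_count(2 * SHORT_SZ));	/* prints 2 */
		return 0;
	}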
+ */ + smp_mb(); + ar->size = n; +} + char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; @@ -820,6 +834,7 @@ EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); EXPORT_SYMBOL(memory_read_from_buffer); +EXPORT_SYMBOL(simple_transaction_set); EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index abf83881f68a..1a54ae14a192 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -104,6 +104,16 @@ static void set_grace_period(void) schedule_delayed_work(&grace_period_end, grace_period); } +static void restart_grace(void) +{ + if (nlmsvc_ops) { + cancel_delayed_work_sync(&grace_period_end); + locks_end_grace(&lockd_manager); + nlmsvc_invalidate_all(); + set_grace_period(); + } +} + /* * This is the lockd kernel thread */ @@ -149,10 +159,7 @@ lockd(void *vrqstp) if (signalled()) { flush_signals(current); - if (nlmsvc_ops) { - nlmsvc_invalidate_all(); - set_grace_period(); - } + restart_grace(); continue; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 763b78a6e9de..83ee34203bd7 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_granted; goto out; case -EAGAIN: + /* + * If this is a blocking request for an + * already pending lock request then we need + * to put it back on lockd's block list + */ + if (wait) + break; ret = nlm_lck_denied; - break; + goto out; case FILE_LOCK_DEFERRED: if (wait) break; @@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - ret = nlm_lck_denied; - if (!wait) - goto out; - ret = nlm_lck_blocked; /* Append to list of blocked */ diff --git a/fs/namei.c b/fs/namei.c index b8433ebfae05..967c3db92724 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1130,8 +1130,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, * @nd: pointer to nameidata * @open_flags: open intent flags */ -int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, - struct nameidata *nd, int open_flags) +static int path_lookup_open(int dfd, const char *name, + unsigned int lookup_flags, struct nameidata *nd, int open_flags) { struct file *filp = get_empty_filp(); int err; @@ -1248,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) int err; struct qstr this; + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + err = __lookup_one_len(name, &this, base, len); if (err) return ERR_PTR(err); @@ -1635,18 +1637,19 @@ static int open_will_write_to_fs(int flag, struct inode *inode) * open_to_namei_flags() for more details. 
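Back on the simple_transaction_set() addition above: the smp_mb() implements the usual publish protocol, fill ar->data, barrier, then set ar->size, so a reader that sees a non-zero size also sees complete data. Sketched below; the reader half is this example's assumption about the consuming side, not a quote of libfs:

	/* writer side, cf. simple_transaction_set() */
	static void publish(struct simple_transaction_argresp *ar,
			    const void *result, size_t n)
	{
		memcpy(ar->data, result, n);
		smp_mb();		/* publish the data before the size */
		ar->size = n;
	}

	/* hypothetical reader side */
	static size_t consume(struct simple_transaction_argresp *ar, void *out)
	{
		size_t size = ar->size;

		if (size) {
			smp_rmb();	/* pairs with the writer's smp_mb() */
			memcpy(out, ar->data, size);
		}
		return size;
	}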
*/ struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode) + int open_flag, int mode, int acc_mode) { struct file *filp; struct nameidata nd; - int acc_mode, error; + int error; struct path path; struct dentry *dir; int count = 0; int will_write; int flag = open_to_namei_flags(open_flag); - acc_mode = MAY_OPEN | ACC_MODE(flag); + if (!acc_mode) + acc_mode = MAY_OPEN | ACC_MODE(flag); /* O_TRUNC implies we need access checks for write permissions */ if (flag & O_TRUNC) @@ -1867,7 +1870,7 @@ do_link: */ struct file *filp_open(const char *filename, int flags, int mode) { - return do_filp_open(AT_FDCWD, filename, flags, mode); + return do_filp_open(AT_FDCWD, filename, flags, mode, 0); } EXPORT_SYMBOL(filp_open); diff --git a/fs/namespace.c b/fs/namespace.c index c6f54e4c4290..134d494158d9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -695,12 +695,16 @@ static inline void mangle(struct seq_file *m, const char *s) */ int generic_show_options(struct seq_file *m, struct vfsmount *mnt) { - const char *options = mnt->mnt_sb->s_options; + const char *options; + + rcu_read_lock(); + options = rcu_dereference(mnt->mnt_sb->s_options); if (options != NULL && options[0]) { seq_putc(m, ','); mangle(m, options); } + rcu_read_unlock(); return 0; } @@ -721,11 +725,22 @@ EXPORT_SYMBOL(generic_show_options); */ void save_mount_options(struct super_block *sb, char *options) { - kfree(sb->s_options); - sb->s_options = kstrdup(options, GFP_KERNEL); + BUG_ON(sb->s_options); + rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); } EXPORT_SYMBOL(save_mount_options); +void replace_mount_options(struct super_block *sb, char *options) +{ + char *old = sb->s_options; + rcu_assign_pointer(sb->s_options, options); + if (old) { + synchronize_rcu(); + kfree(old); + } +} +EXPORT_SYMBOL(replace_mount_options); + #ifdef CONFIG_PROC_FS /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) @@ -1073,9 +1088,7 @@ static int do_umount(struct vfsmount *mnt, int flags) */ if (flags & MNT_FORCE && sb->s_op->umount_begin) { - lock_kernel(); sb->s_op->umount_begin(sb); - unlock_kernel(); } /* @@ -1377,7 +1390,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); - touch_mnt_namespace(current->nsproxy->mnt_ns); + touch_mnt_namespace(parent_path->mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -1920,8 +1933,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; - /* Default to relatime */ - mnt_flags |= MNT_RELATIME; + /* Default to relatime unless overridden */ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index f54360f50a9c..fa038df63ac8 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -660,13 +660,10 @@ outrel: if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) return -ENOMEM; if (user.object_name_len) { - newname = kmalloc(user.object_name_len, GFP_USER); - if (!newname) - return -ENOMEM; - if (copy_from_user(newname, user.object_name, user.object_name_len)) { - kfree(newname); - return -EFAULT; - } + newname = memdup_user(user.object_name, + user.object_name_len); + if (IS_ERR(newname)) + return PTR_ERR(newname); } else { newname = NULL; } @@ -760,13 +757,9 @@ outrel: if (user.len > NCP_PRIVATE_DATA_MAX_LEN) return -ENOMEM; if 
(user.len) { - new = kmalloc(user.len, GFP_USER); - if (!new) - return -ENOMEM; - if (copy_from_user(new, user.data, user.len)) { - kfree(new); - return -EFAULT; - } + new = memdup_user(user.data, user.len); + if (IS_ERR(new)) + return PTR_ERR(new); } else { new = NULL; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 370b190a09d1..89f98e9a024b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1943,7 +1943,8 @@ int nfs_permission(struct inode *inode, int mask) case S_IFREG: /* NFSv4 has atomic_open... */ if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) - && (mask & MAY_OPEN)) + && (mask & MAY_OPEN) + && !(mask & MAY_EXEC)) goto out; break; case S_IFDIR: diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3523b895eb4b..ec7e27d00bc6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -516,13 +516,11 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_unlock; ret = nfs_updatepage(filp, page, 0, pagelen); - if (ret == 0) - ret = pagelen; out_unlock: + if (!ret) + return VM_FAULT_LOCKED; unlock_page(page); - if (ret) - ret = VM_FAULT_SIGBUS; - return ret; + return VM_FAULT_SIGBUS; } static struct vm_operations_struct nfs_file_vm_ops = { diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index e6a1932c7110..35869a4921f1 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -713,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, if (args->npages != 0) xdr_encode_pages(buf, args->pages, 0, args->len); else - req->rq_slen += args->len; + req->rq_slen = xdr_adjust_iovec(req->rq_svec, + p + XDR_QUADLEN(args->len)); err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 82eaadbff408..d2d67781c579 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -683,9 +683,12 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) */ static void nfs_umount_begin(struct super_block *sb) { - struct nfs_server *server = NFS_SB(sb); + struct nfs_server *server; struct rpc_clnt *rpc; + lock_kernel(); + + server = NFS_SB(sb); /* -EIO all pending I/O */ rpc = server->client_acl; if (!IS_ERR(rpc)) @@ -693,6 +696,8 @@ static void nfs_umount_begin(struct super_block *sb) rpc = server->client; if (!IS_ERR(rpc)) rpc_killall_tasks(rpc); + + unlock_kernel(); } /* @@ -1228,7 +1233,6 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); - kfree(string); switch (token) { case Opt_xprt_udp: @@ -1258,6 +1262,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); + kfree(string); switch (token) { case Opt_xprt_udp: @@ -2106,8 +2111,7 @@ out_err_nosb: error_splat_root: dput(mntroot); error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); goto out; } @@ -2203,8 +2207,7 @@ out_err_noserver: return error; error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); return error; } @@ -2464,8 +2467,7 @@ out_free: error_splat_root: dput(mntroot); error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); goto out; } @@ -2559,8 +2561,7 @@ out_err_noserver: return error; error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); return error; } @@ -2644,8 +2645,7 @@ out_err_noserver: return error; 
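The ncpfs hunks above are typical of the series-wide memdup_user() conversion: the allocate/copy/free-on-fault dance collapses into one call whose failures travel in the returned pointer. Schematically, with buf, ubuf and len standing for any user pointer and length:

	/* before: two failure paths to get wrong */
	buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	if (copy_from_user(buf, ubuf, len)) {
		kfree(buf);
		return -EFAULT;
	}

	/* after: one call, errors carried as an ERR_PTR */
	buf = memdup_user(ubuf, len);
	if (IS_ERR(buf))
		return PTR_ERR(buf);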
error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); return error; } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 44d7d04dab95..503b9da159a3 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -1,6 +1,7 @@ config NFSD tristate "NFS server support" depends on INET + depends on FILE_LOCKING select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9dbd2eb91281..7c9fe838f038 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -18,6 +18,7 @@ #include <linux/unistd.h> #include <linux/slab.h> #include <linux/major.h> +#include <linux/magic.h> #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> @@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, struct nfsd3_writeres *resp) { __be32 nfserr; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", SVCFH_fmt(&argp->fh), @@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, nfserr = nfsd_write(rqstp, &resp->fh, NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &resp->committed); - resp->count = argp->count; + resp->count = cnt; RETURN_STATUS(nfserr); } @@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; /* Note that we don't care for remote fs's here */ - if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) { + if (sb->s_magic == MSDOS_SUPER_MAGIC) { resp->f_properties = NFS3_FSF_BILLYBOY; } resp->f_maxfilesize = sb->s_maxbytes; @@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->p_link_max = EXT2_LINK_MAX; resp->p_name_max = EXT2_NAME_LEN; break; - case 0x4d44: /* MSDOS_SUPER_MAGIC */ + case MSDOS_SUPER_MAGIC: resp->p_case_insensitive = 1; resp->p_case_preserving = 0; break; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c464181b5994..290289bd44f7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -218,7 +218,7 @@ static int encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) { __be32 *p; - int len = cb_rec->cbr_fhlen; + int len = cb_rec->cbr_fh.fh_size; RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); @@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); - WRITEMEM(cb_rec->cbr_fhval, len); + WRITEMEM(&cb_rec->cbr_fh.fh_base, len); return 0; } @@ -361,9 +361,8 @@ static struct rpc_program cb_program = { /* Reference counting, callback cleanup, etc., all look racy as heck. * And why is cb_set an atomic? 
*/ -static int do_probe_callback(void *data) +static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp) { - struct nfs4_client *clp = data; struct sockaddr_in addr; struct nfs4_callback *cb = &clp->cl_callback; struct rpc_timeout timeparms = { @@ -384,17 +383,10 @@ static int do_probe_callback(void *data) .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), .client_name = clp->cl_principal, }; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - }; struct rpc_clnt *client; - int status; - if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { - status = nfserr_cb_path_down; - goto out_err; - } + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return ERR_PTR(-EINVAL); /* Initialize address */ memset(&addr, 0, sizeof(addr)); @@ -404,9 +396,29 @@ static int do_probe_callback(void *data) /* Create RPC client */ client = rpc_create(&args); + if (IS_ERR(client)) + dprintk("NFSD: couldn't create callback client: %ld\n", + PTR_ERR(client)); + return client; + +} + +static int do_probe_callback(void *data) +{ + struct nfs4_client *clp = data; + struct nfs4_callback *cb = &clp->cl_callback; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, + }; + struct rpc_clnt *client; + int status; + + client = setup_callback_client(clp); if (IS_ERR(client)) { - dprintk("NFSD: couldn't create callback client\n"); status = PTR_ERR(client); + dprintk("NFSD: couldn't create callback client: %d\n", + status); goto out_err; } @@ -422,10 +434,10 @@ static int do_probe_callback(void *data) out_release_client: rpc_shutdown_client(client); out_err: - dprintk("NFSD: warning: no callback path to client %.*s\n", - (int)clp->cl_name.len, clp->cl_name.data); + dprintk("NFSD: warning: no callback path to client %.*s: error %d\n", + (int)clp->cl_name.len, clp->cl_name.data, status); put_nfs4_client(clp); - return status; + return 0; } /* @@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) /* * called with dp->dl_count inc'ed. - * nfs4_lock_state() may or may not have been called. */ void nfsd4_cb_recall(struct nfs4_delegation *dp) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9fa60a3ad48c..b2883e9c6381 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_truncate = 0; if (open->op_create) { + /* FIXME: check session persistence and pnfs flags. + * The nfsv4.1 spec requires the following semantics: + * + * Persistent | pNFS | Server REQUIRED | Client Allowed + * Reply Cache | server | | + * -------------+--------+-----------------+-------------------- + * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * | | | (SHOULD) + * | | and EXCLUSIVE4 | or EXCLUSIVE4 + * | | | (SHOULD NOT) + * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * yes | no | GUARDED4 | GUARDED4 + * yes | yes | GUARDED4 | GUARDED4 + */ + /* * Note: create modes (UNCHECKED,GUARDED...) are the same * in NFSv4 as in v3. 
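The setup_callback_client() split above also changes the error convention: rather than an int status plus side effects, the helper hands back the client pointer itself, with failures encoded by ERR_PTR(). The shape, where make_client() and its config check are placeholders of this sketch:

	static struct rpc_clnt *make_client(void)
	{
		if (!config_is_valid)			/* placeholder check */
			return ERR_PTR(-EINVAL);
		return rpc_create(&args);		/* rpc_create() already returns ERR_PTRs */
	}

	/* caller */
	client = make_client();
	if (IS_ERR(client))
		return PTR_ERR(client);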
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o (u32 *)open->op_verf.data, &open->op_truncate, &created); - /* If we ever decide to use different attrs to store the - * verifier in nfsd_create_v3, then we'll need to change this + /* + * Following rfc 3530 14.2.16, use the returned bitmask + * to indicate which attributes we used to store the + * verifier: */ if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) - open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | + open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); } else { status = nfsd_lookup(rqstp, current_fh, @@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o goto out; set_change_info(&open->op_cinfo, current_fh); - - /* set reply cache */ fh_dup2(current_fh, &resfh); - open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); + /* set reply cache */ + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &resfh.fh_handle); if (!created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - &current_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &current_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); @@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ return status; } +static void +copy_clientid(clientid_t *clid, struct nfsd4_session *session) +{ + struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)session->se_sessionid.data; + + clid->cl_boot = sid->clientid.cl_boot; + clid->cl_id = sid->clientid.cl_id; +} static __be32 nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { __be32 status; + struct nfsd4_compoundres *resp; + dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", (int)open->op_fname.len, open->op_fname.data, open->op_stateowner); @@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + if (nfsd4_has_session(cstate)) + copy_clientid(&open->op_clientid, cstate->session); + nfs4_lock_state(); /* check seqid for replay. 
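fh_copy_shallow(), adopted throughout the hunks above in place of paired length/bytes copies, presumably reduces to something like the following; this is a reconstruction from the call sites, not a quote of the nfsd headers:

	/* assumed shape of the helper; fh_base is the opaque handle payload */
	static inline void fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
	{
		dst->fh_size = src->fh_size;
		memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
	}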
set nfs4_owner */ - status = nfsd4_process_open1(open); + resp = rqstp->rq_resp; + status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_stateowner->so_replay; fh_put(&cstate->current_fh); - cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; - memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, - rp->rp_openfh_len); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" @@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: - status = nfserr_inval; - if (open->op_create) - goto out; - /* fall through */ case NFS4_OPEN_CLAIM_NULL: /* * (1) set CURRENT_FH to the file being opened, @@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; - getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; + getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); getattr->ga_fhp = &cstate->current_fh; return nfs_ok; @@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &read->rd_stateid, - CHECK_FH | RD_STATE, &read->rd_filp))) { + if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } @@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; - readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; + readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) @@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); + status = nfs4_preprocess_stateid_op(cstate, + &setattr->sa_stateid, WR_STATE, NULL); nfs4_unlock_state(); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); @@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file *filp = NULL; u32 *p; __be32 status = nfs_ok; + unsigned long cnt; /* no need to check permission - this will be done in nfsd_write() */ @@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, - CHECK_FH | WR_STATE, &filp); + status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp); if (filp) get_file(filp); nfs4_unlock_state(); @@ -703,7 +727,7 @@ 
nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - write->wr_bytes_written = write->wr_buflen; + cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; p = (u32 *)write->wr_verifier.data; *p++ = nfssvc_boot.tv_sec; @@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, write->wr_vlen, - write->wr_buflen, &write->wr_how_written); + &cnt, &write->wr_how_written); if (filp) fput(filp); + write->wr_bytes_written = cnt; + if (status == nfserr_symlink) status = nfserr_inval; return status; @@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) - || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) + if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) + || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) + || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) return nfserr_attrnotsupp; if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) @@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out_kfree; - p = buf + 3; + /* skip bitmap */ + p = buf + 1 + ntohl(buf[0]); status = nfserr_not_same; if (ntohl(*p++) != verify->ve_attrlen) goto out_kfree; @@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum) nfsdstats.nfs4_opcount[opnum]++; } -static void cstate_free(struct nfsd4_compound_state *cstate) -{ - if (cstate == NULL) - return; - fh_put(&cstate->current_fh); - fh_put(&cstate->save_fh); - BUG_ON(cstate->replay_owner); - kfree(cstate); -} - -static struct nfsd4_compound_state *cstate_alloc(void) -{ - struct nfsd4_compound_state *cstate; - - cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL); - if (cstate == NULL) - return NULL; - fh_init(&cstate->current_fh, NFS4_FHSIZE); - fh_init(&cstate->save_fh, NFS4_FHSIZE); - cstate->replay_owner = NULL; - return cstate; -} - typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); +enum nfsd4_op_flags { + ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ + ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 1 << 2, /* ops required first in compound */ +}; struct nfsd4_operation { nfsd4op_func op_func; u32 op_flags; -/* Most ops require a valid current filehandle; a few don't: */ -#define ALLOWED_WITHOUT_FH 1 -/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */ -#define ALLOWED_ON_ABSENT_FS 2 char *op_name; }; @@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[]; static const char *nfsd4_op_name(unsigned opnum); /* + * This is a replay of a compound for which no cache entry pages + * were used. Encode the sequence operation, and if cachethis is FALSE + * encode the uncache rep error on the next operation. 
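Since nfsd4_op_flags values are tested with a bitwise AND (see nfs41_op_ordering_ok() below), the three flags must occupy distinct bits; packed as 1, 2 and 3, the first-op test would also fire for any op that merely allows running without a filehandle. A standalone check:

	#include <stdio.h>

	enum { NO_FH = 1 << 0, ABSENT_FS = 1 << 1, FIRST_OP = 1 << 2 };

	int main(void)
	{
		unsigned int op_flags = NO_FH | ABSENT_FS;	/* a PUTROOTFH-like op */

		/* distinct bits: prints "no"; with values 1, 2, 3 it would print "yes" */
		printf("allowed first in compound: %s\n",
		       (op_flags & FIRST_OP) ? "yes" : "no");
		return 0;
	}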
+ */ +static __be32 +nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + + dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__, + resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis); + + /* Encode the replayed sequence operation */ + BUG_ON(resp->opcnt != 1); + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); + + /*return nfserr_retry_uncached_rep in next operation. */ + if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } + return op->status; +} + +/* + * Enforce NFSv4.1 COMPOUND ordering rules. + * + * TODO: + * - enforce NFS4ERR_NOT_ONLY_OP, + * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. + */ +static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) +{ + if (args->minorversion && args->opcnt > 0) { + struct nfsd4_op *op = &args->ops[0]; + return (op->status == nfserr_op_illegal) || + (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); + } + return true; +} + +/* * COMPOUND call. */ static __be32 @@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, { struct nfsd4_op *op; struct nfsd4_operation *opdesc; - struct nfsd4_compound_state *cstate = NULL; + struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; __be32 status; resp->xbuf = &rqstp->rq_res; - resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; + resp->p = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; resp->tagp = resp->p; /* reserve space for: taglen, tag, and opcnt */ resp->p += 2 + XDR_QUADLEN(args->taglen); @@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->tag = args->tag; resp->opcnt = 0; resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; + resp->cstate.replay_owner = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* Use the deferral mechanism only for NFSv4.0 compounds */ + rqstp->rq_usedeferral = (args->minorversion == 0); /* * According to RFC3010, this takes precedence over all other errors. 
*/ status = nfserr_minor_vers_mismatch; - if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) + if (args->minorversion > nfsd_supported_minorversion) goto out; - status = nfserr_resource; - cstate = cstate_alloc(); - if (cstate == NULL) - goto out; + if (!nfs41_op_ordering_ok(args)) { + op = &args->ops[0]; + op->status = nfserr_sequence_pos; + goto encode_op; + } status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { @@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", resp->opcnt, args->opcnt, op->opnum, nfsd4_op_name(op->opnum)); - /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, BUG_ON(op->status == nfs_ok); encode_op: + /* Only from SEQUENCE or CREATE_SESSION */ + if (resp->cstate.status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + if (nfsd4_not_cached(resp)) + status = nfsd4_enc_uncached_replay(args, resp); + else + status = op->status; + goto out; + } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(resp, op); @@ -961,15 +1028,24 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } + if (!rqstp->rq_usedeferral && status == nfserr_dropit) { + dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__); + status = nfserr_jukebox; + } - cstate_free(cstate); + resp->cstate.status = status; + fh_put(&resp->cstate.current_fh); + fh_put(&resp->cstate.save_fh); + BUG_ON(resp->cstate.replay_owner); out: nfsd4_release_compoundargs(args); + /* Reset deferral mechanism for RPC deferrals */ + rqstp->rq_usedeferral = 1; dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } -static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { +static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, .op_name = "OP_ACCESS", @@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { - /* unsupported, just for future reference: */ + .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_PUTPUBFH", }, @@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_RELEASE_LOCKOWNER", }, + + /* NFSv4.1 operations */ + [OP_EXCHANGE_ID] = { + .op_func = (nfsd4op_func)nfsd4_exchange_id, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_EXCHANGE_ID", + }, + [OP_CREATE_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_create_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_CREATE_SESSION", + }, + [OP_DESTROY_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_destroy_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_DESTROY_SESSION", + }, + [OP_SEQUENCE] = { + .op_func = (nfsd4op_func)nfsd4_sequence, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 74f7b67567fd..b5348405046b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -182,36 +182,26 @@ out_unlock: typedef int (recdir_func)(struct dentry *, struct dentry *); -struct dentry_list { - struct dentry *dentry; +struct name_list { + char 
name[HEXDIR_LEN]; struct list_head list; }; -struct dentry_list_arg { - struct list_head dentries; - struct dentry *parent; -}; - static int -nfsd4_build_dentrylist(void *arg, const char *name, int namlen, +nfsd4_build_namelist(void *arg, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct dentry_list_arg *dla = arg; - struct list_head *dentries = &dla->dentries; - struct dentry *parent = dla->parent; - struct dentry *dentry; - struct dentry_list *child; + struct list_head *names = arg; + struct name_list *entry; - if (name && isdotent(name, namlen)) + if (namlen != HEXDIR_LEN - 1) return 0; - dentry = lookup_one_len(name, parent, namlen); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - child = kmalloc(sizeof(*child), GFP_KERNEL); - if (child == NULL) + entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); + if (entry == NULL) return -ENOMEM; - child->dentry = dentry; - list_add(&child->list, dentries); + memcpy(entry->name, name, HEXDIR_LEN - 1); + entry->name[HEXDIR_LEN - 1] = '\0'; + list_add(&entry->list, names); return 0; } @@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) { const struct cred *original_cred; struct file *filp; - struct dentry_list_arg dla = { - .parent = dir, - }; - struct list_head *dentries = &dla.dentries; - struct dentry_list *child; + LIST_HEAD(names); + struct name_list *entry; + struct dentry *dentry; int status; if (!rec_dir_init) @@ -233,67 +221,42 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) status = nfs4_save_creds(&original_cred); if (status < 0) return status; - INIT_LIST_HEAD(dentries); filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, current_cred()); status = PTR_ERR(filp); if (IS_ERR(filp)) goto out; - INIT_LIST_HEAD(dentries); - status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla); + status = vfs_readdir(filp, nfsd4_build_namelist, &names); fput(filp); - while (!list_empty(dentries)) { - child = list_entry(dentries->next, struct dentry_list, list); - status = f(dir, child->dentry); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + break; + } + status = f(dir, dentry); + dput(dentry); if (status) - goto out; - list_del(&child->list); - dput(child->dentry); - kfree(child); + break; + list_del(&entry->list); + kfree(entry); } + mutex_unlock(&dir->d_inode->i_mutex); out: - while (!list_empty(dentries)) { - child = list_entry(dentries->next, struct dentry_list, list); - list_del(&child->list); - dput(child->dentry); - kfree(child); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + list_del(&entry->list); + kfree(entry); } nfs4_reset_creds(original_cred); return status; } static int -nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry) -{ - int status; - - if (!S_ISREG(dir->d_inode->i_mode)) { - printk("nfsd4: non-file found in client recovery directory\n"); - return -EINVAL; - } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - status = vfs_unlink(dir->d_inode, dentry); - mutex_unlock(&dir->d_inode->i_mutex); - return status; -} - -static int -nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry) -{ - int status; - - /* For now this directory should already be empty, but we empty it of - * any regular files anyway, just in case the directory was created by - * a kernel from the 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..3b711f5147a7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
 static u32 nfs4_init;
 static stateid_t zerostateid;             /* bits all 0 */
 static stateid_t onestateid;              /* bits all 1 */
+static u64 current_sessionid = 1;
 
 #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
 #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid;              /* bits all 1 */
 /* forward declarations */
 static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
 static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 static void nfs4_set_recdir(char *recdir);
 
-/* Locking:
- *
- * client_mutex:
- * 	protects clientid_hashtbl[], clientstr_hashtbl[],
- * 	unconfstr_hashtbl[], uncofid_hashtbl[].
- */
+/* Locking: */
+
+/* Currently used for almost all code touching nfsv4 state: */
 static DEFINE_MUTEX(client_mutex);
 
+/*
+ * Currently used for the del_recall_lru and file hash table.  In an
+ * effort to decrease the scope of the client_mutex, this spinlock may
+ * eventually cover more:
+ */
+static DEFINE_SPINLOCK(recall_lock);
+
 static struct kmem_cache *stateowner_slab = NULL;
 static struct kmem_cache *file_slab = NULL;
 static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
 	return x;
 }
 
-/* forward declarations */
-static void release_stateowner(struct nfs4_stateowner *sop);
-static void release_stateid(struct nfs4_stateid *stp, int flags);
-
-/*
- * Delegation state
- */
-
-/* recall_lock protects the del_recall_lru */
-static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
-static void
-free_nfs4_file(struct kref *kref)
-{
-	struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
-	list_del(&fp->fi_hash);
-	iput(fp->fi_inode);
-	kmem_cache_free(file_slab, fp);
-}
-
 static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
-	kref_put(&fi->fi_ref, free_nfs4_file);
+	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+		list_del(&fi->fi_hash);
+		spin_unlock(&recall_lock);
+		iput(fi->fi_inode);
+		kmem_cache_free(file_slab, fi);
+	}
 }
 
 static inline void
 get_nfs4_file(struct nfs4_file *fi)
 {
-	kref_get(&fi->fi_ref);
+	atomic_inc(&fi->fi_ref);
 }
 
 static int num_delegations;
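Editor's note: put_nfs4_file() above switches from a kref to the atomic_dec_and_lock() idiom, so the hash-table lock is taken only when the count may actually reach zero. A hedged user-space model of that idiom (pthread mutex and names are stand-ins, not the kernel API):

	/* sketch: lock-free fast path, lock taken only for the last put */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdlib.h>

	struct obj {
		atomic_int ref;
		/* ... hash linkage, payload ... */
	};

	static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

	/* returns 1 (with table_lock held) iff the count dropped to zero */
	static int atomic_dec_and_lock_model(atomic_int *v, pthread_mutex_t *m)
	{
		int old = atomic_load(v);

		/* fast path: decrement without the lock while count > 1 */
		while (old > 1) {
			if (atomic_compare_exchange_weak(v, &old, old - 1))
				return 0;
		}
		/* slow path: the last reference may be going away */
		pthread_mutex_lock(m);
		if (atomic_fetch_sub(v, 1) == 1)
			return 1;	/* caller unhashes, then frees */
		pthread_mutex_unlock(m);
		return 0;
	}

	static void put_obj(struct obj *o)
	{
		if (atomic_dec_and_lock_model(&o->ref, &table_lock)) {
			/* unhash would happen here, under the lock */
			pthread_mutex_unlock(&table_lock);
			free(o);
		}
	}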
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
 	dp->dl_stateid.si_generation = 0;
-	dp->dl_fhlen = current_fh->fh_handle.fh_size;
-	memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
-			current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,290 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
 static struct list_head client_lru;
 static struct list_head close_lru;
 
+static void unhash_generic_stateid(struct nfs4_stateid *stp)
+{
+	list_del(&stp->st_hash);
+	list_del(&stp->st_perfile);
+	list_del(&stp->st_perstateowner);
+}
+
+static void free_generic_stateid(struct nfs4_stateid *stp)
+{
+	put_nfs4_file(stp->st_file);
+	kmem_cache_free(stateid_slab, stp);
+}
+
+static void release_lock_stateid(struct nfs4_stateid *stp)
+{
+	unhash_generic_stateid(stp);
+	locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
+	free_generic_stateid(stp);
+}
+
+static void unhash_lockowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_stateid *stp;
+
+	list_del(&sop->so_idhash);
+	list_del(&sop->so_strhash);
+	list_del(&sop->so_perstateid);
+	while (!list_empty(&sop->so_stateids)) {
+		stp = list_first_entry(&sop->so_stateids,
+				struct nfs4_stateid, st_perstateowner);
+		release_lock_stateid(stp);
+	}
+}
+
+static void release_lockowner(struct nfs4_stateowner *sop)
+{
+	unhash_lockowner(sop);
+	nfs4_put_stateowner(sop);
+}
+
+static void
+release_stateid_lockowners(struct nfs4_stateid *open_stp)
+{
+	struct nfs4_stateowner *lock_sop;
+
+	while (!list_empty(&open_stp->st_lockowners)) {
+		lock_sop = list_entry(open_stp->st_lockowners.next,
+				struct nfs4_stateowner, so_perstateid);
+		/* list_del(&open_stp->st_lockowners);  */
+		BUG_ON(lock_sop->so_is_open_owner);
+		release_lockowner(lock_sop);
+	}
+}
+
+static void release_open_stateid(struct nfs4_stateid *stp)
+{
+	unhash_generic_stateid(stp);
+	release_stateid_lockowners(stp);
+	nfsd_close(stp->st_vfs_file);
+	free_generic_stateid(stp);
+}
+
+static void unhash_openowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_stateid *stp;
+
+	list_del(&sop->so_idhash);
+	list_del(&sop->so_strhash);
+	list_del(&sop->so_perclient);
+	list_del(&sop->so_perstateid); /* XXX: necessary? */
+	while (!list_empty(&sop->so_stateids)) {
+		stp = list_first_entry(&sop->so_stateids,
+				struct nfs4_stateid, st_perstateowner);
+		release_open_stateid(stp);
+	}
+}
+
+static void release_openowner(struct nfs4_stateowner *sop)
+{
+	unhash_openowner(sop);
+	list_del(&sop->so_close_lru);
+	nfs4_put_stateowner(sop);
+}
+
+static DEFINE_SPINLOCK(sessionid_lock);
+#define SESSION_HASH_SIZE	512
+static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
+
+static inline int
+hash_sessionid(struct nfs4_sessionid *sessionid)
+{
+	struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
+
+	return sid->sequence % SESSION_HASH_SIZE;
+}
+
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+	u32 *ptr = (u32 *)(&sessionid->data[0]);
+	dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
+}
+
+static void
+gen_sessionid(struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd4_sessionid *sid;
+
+	sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
+	sid->clientid = clp->cl_clientid;
+	sid->sequence = current_sessionid++;
+	sid->reserved = 0;
+}
+
+/*
+ * Give the client the number of slots it requests bound by
+ * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
+ *
+ * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
+ * should (up to a point) re-negotiate active sessions and reduce their
+ * slot usage to make room for new connections. For now we just fail the
+ * create session.
+ */
+static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
+{
+	int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
+
+	spin_lock(&nfsd_serv->sv_lock);
+	if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
+		np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
+	nfsd_serv->sv_drc_pages_used += np;
+	spin_unlock(&nfsd_serv->sv_lock);
+
+	if (np <= 0) {
+		status = nfserr_resource;
+		fchan->maxreqs = 0;
+	} else
+		fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
+
+	return status;
+}
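Editor's note: the DRC accounting above converts the requested slot count into whole pages, clamps the claim to whatever is left in the server-wide pool, and converts back to slots (zero meaning the CREATE_SESSION fails). A runnable model of just that arithmetic, with illustrative constants standing in for NFSD_PAGES_PER_SLOT and the sv_drc_* fields:

	/* sketch: page-budget clamp used by set_forechannel_maxreqs() */
	#include <stdio.h>

	#define PAGES_PER_SLOT 1	/* assumed stand-in for NFSD_PAGES_PER_SLOT */

	static int drc_pages_used, drc_max_pages = 8;

	static int budget_slots(int requested_slots)
	{
		int np = requested_slots * PAGES_PER_SLOT;

		if (np + drc_pages_used > drc_max_pages)
			np = drc_max_pages - drc_pages_used;	/* clamp to remainder */
		drc_pages_used += np;
		return np <= 0 ? 0 : np / PAGES_PER_SLOT;	/* 0: fail create_session */
	}

	int main(void)
	{
		printf("%d\n", budget_slots(6));	/* 6: fits entirely */
		printf("%d\n", budget_slots(6));	/* 2: clamped to the remainder */
		printf("%d\n", budget_slots(6));	/* 0: pool exhausted */
		return 0;
	}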
+
+/*
+ * fchan holds the client values on input, and the server values on output
+ */
+static int init_forechannel_attrs(struct svc_rqst *rqstp,
+				  struct nfsd4_session *session,
+				  struct nfsd4_channel_attrs *fchan)
+{
+	int status = 0;
+	__u32 maxcount = svc_max_payload(rqstp);
+
+	/* headerpadsz set to zero in encode routine */
+
+	/* Use the client's max request and max response size if possible */
+	if (fchan->maxreq_sz > maxcount)
+		fchan->maxreq_sz = maxcount;
+	session->se_fmaxreq_sz = fchan->maxreq_sz;
+
+	if (fchan->maxresp_sz > maxcount)
+		fchan->maxresp_sz = maxcount;
+	session->se_fmaxresp_sz = fchan->maxresp_sz;
+
+	/* Set the max response cached size to our default, which is
+	 * a multiple of PAGE_SIZE and small */
+	session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+	fchan->maxresp_cached = session->se_fmaxresp_cached;
+
+	/* Use the client's maxops if possible */
+	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
+		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
+	session->se_fmaxops = fchan->maxops;
+
+	/* try to use the client requested number of slots */
+	if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+		fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+
+	/* FIXME: Error means no more DRC pages so the server should
+	 * recover pages from existing sessions. For now fail session
+	 * creation.
+	 */
+	status = set_forechannel_maxreqs(fchan);
+
+	session->se_fnumslots = fchan->maxreqs;
+	return status;
+}
+
+static int
+alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
+		   struct nfsd4_create_session *cses)
+{
+	struct nfsd4_session *new, tmp;
+	int idx, status = nfserr_resource, slotsize;
+
+	memset(&tmp, 0, sizeof(tmp));
+
+	/* FIXME: For now, we just accept the client back channel attributes. */
+	status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
+	if (status)
+		goto out;
+
+	/* allocate struct nfsd4_session and slot table in one piece */
+	slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
+	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
+	if (!new)
+		goto out;
+
+	memcpy(new, &tmp, sizeof(*new));
+
+	new->se_client = clp;
+	gen_sessionid(new);
+	idx = hash_sessionid(&new->se_sessionid);
+	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+
+	new->se_flags = cses->flags;
+	kref_init(&new->se_ref);
+	spin_lock(&sessionid_lock);
+	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+	list_add(&new->se_perclnt, &clp->cl_sessions);
+	spin_unlock(&sessionid_lock);
+
+	status = nfs_ok;
+out:
+	return status;
+}
+
+/* caller must hold sessionid_lock */
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+{
+	struct nfsd4_session *elem;
+	int idx;
+
+	dump_sessionid(__func__, sessionid);
+	idx = hash_sessionid(sessionid);
+	dprintk("%s: idx is %d\n", __func__, idx);
+	/* Search in the appropriate list */
+	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+		dump_sessionid("list traversal", &elem->se_sessionid);
+		if (!memcmp(elem->se_sessionid.data, sessionid->data,
+			    NFS4_MAX_SESSIONID_LEN)) {
+			return elem;
+		}
+	}
+
+	dprintk("%s: session not found\n", __func__);
+	return NULL;
+}
+
+/* caller must hold sessionid_lock */
+static void
+unhash_session(struct nfsd4_session *ses)
+{
+	list_del(&ses->se_hash);
+	list_del(&ses->se_perclnt);
+}
+
+static void
+release_session(struct nfsd4_session *ses)
+{
+	spin_lock(&sessionid_lock);
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+	nfsd4_put_session(ses);
+}
+
+static void nfsd4_release_respages(struct page **respages, short resused);
+
+void
+free_session(struct kref *kref)
+{
+	struct nfsd4_session *ses;
+	int i;
+
+	ses = container_of(kref, struct nfsd4_session, se_ref);
+	for (i = 0; i < ses->se_fnumslots; i++) {
+		struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
+		nfsd4_release_respages(e->ce_respages, e->ce_resused);
+	}
+	kfree(ses);
+}
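Editor's note: alloc_init_session() above sizes the session header and its slot table with a single kzalloc(sizeof(*new) + slotsize), so one kfree() in free_session() releases both. A self-contained model of that flexible-array allocation pattern (types are illustrative only):

	/* sketch: header + variable-size table in one allocation */
	#include <stdio.h>
	#include <stdlib.h>

	struct slot {
		unsigned seqid;
		int inuse;
	};

	struct session {
		int numslots;
		/* ... other per-session state ... */
		struct slot slots[];	/* flexible array, sized at allocation */
	};

	static struct session *session_alloc(int numslots)
	{
		struct session *s;

		s = calloc(1, sizeof(*s) + numslots * sizeof(struct slot));
		if (!s)
			return NULL;
		s->numslots = numslots;
		return s;
	}

	int main(void)
	{
		struct session *s = session_alloc(4);

		if (!s)
			return 1;
		s->slots[3].seqid = 1;	/* the last slot is valid memory */
		printf("slots=%d seqid=%u\n", s->numslots, s->slots[3].seqid);
		free(s);	/* one free releases header and slot table */
		return 0;
	}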
 
 static inline void
 renew_client(struct nfs4_client *clp)
 {
@@ -330,8 +602,8 @@ STALE_CLIENTID(clientid_t *clid)
 {
 	if (clid->cl_boot == boot_time)
 		return 0;
-	dprintk("NFSD stale clientid (%08x/%08x)\n",
-			clid->cl_boot, clid->cl_id);
+	dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
+		clid->cl_boot, clid->cl_id, boot_time);
 	return 1;
 }
 
@@ -376,6 +648,8 @@ static inline void
 free_client(struct nfs4_client *clp)
 {
 	shutdown_callback_client(clp);
+	nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
+			     clp->cl_slot.sl_cache_entry.ce_resused);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -420,7 +694,13 @@ expire_client(struct nfs4_client *clp)
 	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
-		release_stateowner(sop);
+		release_openowner(sop);
+	}
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session  *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				 se_perclnt);
+		release_session(ses);
 	}
 	put_nfs4_client(clp);
 }
@@ -439,6 +719,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	return clp;
 }
@@ -568,25 +849,45 @@ find_unconfirmed_client(clientid_t *clid)
 	return NULL;
 }
 
+/*
+ * Return 1 iff clp's clientid establishment method matches the use_exchange_id
+ * parameter. Matching is based on the fact that at least one of the
+ * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
+ *
+ * FIXME: we need to unify the clientid namespaces for nfsv4.x
+ * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
+ * and SET_CLIENTID{,_CONFIRM}
+ */
+static inline int
+match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
+{
+	bool has_exchange_flags = (clp->cl_exchange_flags != 0);
+	return use_exchange_id == has_exchange_flags;
+}
+
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_str(const char *dname, unsigned int hashval,
+			     bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
 }
 
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
+			       bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
@@ -685,6 +986,534 @@ out_err:
 	return;
 }
 
+void
+nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+
+	resp->cstate.statp = statp;
+}
+
+/*
+ * Dereference the result pages.
+ */
+static void
+nfsd4_release_respages(struct page **respages, short resused)
+{
+	int i;
+
+	dprintk("--> %s\n", __func__);
+	for (i = 0; i < resused; i++) {
+		if (!respages[i])
+			continue;
+		put_page(respages[i]);
+		respages[i] = NULL;
+	}
+}
+
+static void
+nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		topages[i] = frompages[i];
+		if (!topages[i])
+			continue;
+		get_page(topages[i]);
+	}
+}
+
+/*
+ * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
+ * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
+ * length of the XDR response is less than se_fmaxresp_cached
+ * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for a
+ * part of the reply (e.g. readdir).
+ *
+ * Store the base and length of the rq_req.head[0] page
+ * of the NFSv4.1 data, just past the rpc header.
+ */
+void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+	struct nfsd4_op *op = &args->ops[resp->opcnt];
+	struct kvec *resv = &rqstp->rq_res.head[0];
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/* Don't cache a failed OP_SEQUENCE. */
+	if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
+		return;
+
+	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
+	entry->ce_opcnt = resp->opcnt;
+	entry->ce_status = resp->cstate.status;
+
+	/*
+	 * Don't need a page to cache just the sequence operation - the slot
+	 * does this for us!
+	 */
+
+	if (nfsd4_not_cached(resp)) {
+		entry->ce_resused = 0;
+		entry->ce_rpchdrlen = 0;
+		dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
+			resp->cstate.slot->sl_cache_entry.ce_cachethis);
+		return;
+	}
+	entry->ce_resused = rqstp->rq_resused;
+	if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
+		entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
+	nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
+			 entry->ce_resused);
+	entry->ce_datav.iov_base = resp->cstate.statp;
+	entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]));
+	/* Current request rpc header length*/
+	entry->ce_rpchdrlen = (char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]);
+}
+
+/*
+ * We keep the rpc header, but take the nfs reply from the replycache.
+ */
+static int
+nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
+			struct nfsd4_cache_entry *entry)
+{
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct kvec *resv = &resp->rqstp->rq_res.head[0];
+	int len;
+
+	/* Current request rpc header length*/
+	len = (char *)resp->cstate.statp -
+			(char *)page_address(rqstp->rq_respages[0]);
+	if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
+		dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
+			entry->ce_datav.iov_len);
+		return 0;
+	}
+	/* copy the cached reply nfsd data past the current rpc header */
+	memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
+	       entry->ce_datav.iov_len);
+	resv->iov_len = len + entry->ce_datav.iov_len;
+	return 1;
+}
+
+/*
+ * Keep the first page of the replay. Copy the NFSv4.1 data from the first
+ * cached page.  Replace any further replay pages from the cache.
+ */
+__be32
+nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+			 struct nfsd4_sequence *seq)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	__be32 status;
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/*
+	 * If this is just the sequence operation, we did not keep
+	 * a page in the cache entry because we can just use the
+	 * slot info stored in struct nfsd4_sequence that was checked
+	 * against the slot in nfsd4_sequence().
+	 *
+	 * This occurs when seq->cachethis is FALSE, or when the client
+	 * session inactivity timer fires and a solo sequence operation
+	 * is sent (lease renewal).
+	 */
+	if (seq && nfsd4_not_cached(resp)) {
+		seq->maxslots = resp->cstate.session->se_fnumslots;
+		return nfs_ok;
+	}
+
+	if (!nfsd41_copy_replay_data(resp, entry)) {
+		/*
+		 * Not enough room to use the replay rpc header, send the
+		 * cached header. Release all the allocated result pages.
+		 */
+		svc_free_res_pages(resp->rqstp);
+		nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
+			entry->ce_resused);
+	} else {
+		/* Release all but the first allocated result page */
+
+		resp->rqstp->rq_resused--;
+		svc_free_res_pages(resp->rqstp);
+
+		nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
+				 &entry->ce_respages[1],
+				 entry->ce_resused - 1);
+	}
+
+	resp->rqstp->rq_resused = entry->ce_resused;
+	resp->opcnt = entry->ce_opcnt;
+	resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
+	status = entry->ce_status;
+
+	return status;
+}
+
+/*
+ * Set the exchange_id flags returned by the server.
+ */
+static void
+nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+{
+	/* pNFS is not supported */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+
+	/* Referrals are supported, Migration is not. */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+
+	/* set the wire flags to return to client. */
+	clid->flags = new->cl_exchange_flags;
+}
+
+__be32
+nfsd4_exchange_id(struct svc_rqst *rqstp,
+		  struct nfsd4_compound_state *cstate,
+		  struct nfsd4_exchange_id *exid)
+{
+	struct nfs4_client *unconf, *conf, *new;
+	int status;
+	unsigned int strhashval;
+	char dname[HEXDIR_LEN];
+	nfs4_verifier verf = exid->verifier;
+	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+
+	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+		" ip_addr=%u flags %x, spa_how %d\n",
+		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
+		ip_addr, exid->flags, exid->spa_how);
+
+	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+		return nfserr_inval;
+
+	/* Currently only support SP4_NONE */
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_SSV:
+		return nfserr_encr_alg_unsupp;
+	default:
+		BUG();				/* checked by xdr code */
+	case SP4_MACH_CRED:
+		return nfserr_serverfault;	/* no excuse :-/ */
+	}
+
+	status = nfs4_make_rec_clidname(dname, &exid->clname);
+
+	if (status)
+		goto error;
+
+	strhashval = clientstr_hashval(dname);
+
+	nfs4_lock_state();
+	status = nfs_ok;
+
+	conf = find_confirmed_client_by_str(dname, strhashval, true);
+	if (conf) {
+		if (!same_verf(&verf, &conf->cl_verifier)) {
+			/* 18.35.4 case 8 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_not_same;
+				goto out;
+			}
+			/* Client reboot: destroy old state */
+			expire_client(conf);
+			goto out_new;
+		}
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			/* 18.35.4 case 9 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_perm;
+				goto out;
+			}
+			expire_client(conf);
+			goto out_new;
+		}
+		if (ip_addr != conf->cl_addr &&
+		    !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
+			/* Client collision. 18.35.4 case 3 */
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+		/*
+		 * Set bit when the owner id and verifier map to an already
+		 * confirmed client id (18.35.3).
+		 */
+		exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+
+		/*
+		 * Falling into 18.35.4 case 2, possible router replay.
+		 * Leave confirmed record intact and return same result.
+		 */
+		copy_verf(conf, &verf);
+		new = conf;
+		goto out_copy;
+	} else {
+		/* 18.35.4 case 7 */
+		if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+			status = nfserr_noent;
+			goto out;
+		}
+	}
+
+	unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
+	if (unconf) {
+		/*
+		 * Possible retry or client restart.  Per 18.35.4 case 4,
+		 * a new unconfirmed record should be generated regardless
+		 * of whether any properties have changed.
+		 */
+		expire_client(unconf);
+	}
+
+out_new:
+	/* Normal case */
+	new = create_client(exid->clname, dname);
+	if (new == NULL) {
+		status = nfserr_resource;
+		goto out;
+	}
+
+	copy_verf(new, &verf);
+	copy_cred(&new->cl_cred, &rqstp->rq_cred);
+	new->cl_addr = ip_addr;
+	gen_clid(new);
+	gen_confirm(new);
+	add_to_unconfirmed(new, strhashval);
+out_copy:
+	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
+	exid->clientid.cl_id = new->cl_clientid.cl_id;
+
+	new->cl_slot.sl_seqid = 0;
+	exid->seqid = 1;
+	nfsd4_set_ex_flags(new, exid);
+
+	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
+		new->cl_slot.sl_seqid, new->cl_exchange_flags);
+	status = nfs_ok;
+
+out:
+	nfs4_unlock_state();
+error:
+	dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
+	return status;
+}
+
+static int
+check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
+{
+	dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
+		slot->sl_seqid);
+
+	/* The slot is in use, and no response has been sent. */
+	if (slot->sl_inuse) {
+		if (seqid == slot->sl_seqid)
+			return nfserr_jukebox;
+		else
+			return nfserr_seq_misordered;
+	}
+	/* Normal */
+	if (likely(seqid == slot->sl_seqid + 1))
+		return nfs_ok;
+	/* Replay */
+	if (seqid == slot->sl_seqid)
+		return nfserr_replay_cache;
+	/* Wraparound */
+	if (seqid == 1 && (slot->sl_seqid + 1) == 0)
+		return nfs_ok;
+	/* Misordered replay or misordered new request */
+	return nfserr_seq_misordered;
+}
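Editor's note: check_slot_seqid() above encodes the NFSv4.1 slot sequencing state machine: in-use slot means the request is still being processed (retry later) or misordered; otherwise next-in-sequence is normal, equal is a replay served from the cache, and 1-after-wraparound is also normal. A runnable model of exactly those rules, with stand-in names for the nfserr_* codes:

	/* sketch: the slot seqid decision table, as a pure function */
	#include <assert.h>
	#include <stdint.h>

	enum { OK, REPLAY, JUKEBOX, MISORDERED };

	static int slot_check(uint32_t seqid, uint32_t sl_seqid, int sl_inuse)
	{
		if (sl_inuse)				/* request outstanding */
			return seqid == sl_seqid ? JUKEBOX : MISORDERED;
		if (seqid == sl_seqid + 1)		/* next in sequence */
			return OK;
		if (seqid == sl_seqid)			/* retransmission */
			return REPLAY;
		if (seqid == 1 && sl_seqid + 1 == 0)	/* 32-bit wraparound */
			return OK;
		return MISORDERED;
	}

	int main(void)
	{
		assert(slot_check(5, 4, 0) == OK);
		assert(slot_check(4, 4, 0) == REPLAY);
		assert(slot_check(4, 4, 1) == JUKEBOX);	/* still being processed */
		assert(slot_check(6, 4, 0) == MISORDERED);
		assert(slot_check(1, UINT32_MAX, 0) == OK);	/* wraparound */
		return 0;
	}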
+
+__be32
+nfsd4_create_session(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_create_session *cr_ses)
+{
+	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfs4_client *conf, *unconf;
+	struct nfsd4_slot *slot = NULL;
+	int status = 0;
+
+	nfs4_lock_state();
+	unconf = find_unconfirmed_client(&cr_ses->clientid);
+	conf = find_confirmed_client(&cr_ses->clientid);
+
+	if (conf) {
+		slot = &conf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status == nfserr_replay_cache) {
+			dprintk("Got a create_session replay! seqid= %d\n",
+				slot->sl_seqid);
+			cstate->slot = slot;
+			cstate->status = status;
+			/* Return the cached reply status */
+			status = nfsd4_replay_cache_entry(resp, NULL);
+			goto out;
+		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+			status = nfserr_seq_misordered;
+			dprintk("Sequence misordered!\n");
+			dprintk("Expected seqid= %d but got seqid= %d\n",
+				slot->sl_seqid, cr_ses->seqid);
+			goto out;
+		}
+		conf->cl_slot.sl_seqid++;
+	} else if (unconf) {
+		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+		    (ip_addr != unconf->cl_addr)) {
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+
+		slot = &unconf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status) {
+			/* an unconfirmed replay returns misordered */
+			status = nfserr_seq_misordered;
+			goto out;
+		}
+
+		slot->sl_seqid++; /* from 0 to 1 */
+		move_to_confirmed(unconf);
+
+		/*
+		 * We do not support RDMA or persistent sessions
+		 */
+		cr_ses->flags &= ~SESSION4_PERSIST;
+		cr_ses->flags &= ~SESSION4_RDMA;
+
+		conf = unconf;
+	} else {
+		status = nfserr_stale_clientid;
+		goto out;
+	}
+
+	status = alloc_init_session(rqstp, conf, cr_ses);
+	if (status)
+		goto out;
+
+	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+	cr_ses->seqid = slot->sl_seqid;
+
+	slot->sl_inuse = true;
+	cstate->slot = slot;
+	/* Ensure a page is used for the cache */
+	slot->sl_cache_entry.ce_cachethis = 1;
+out:
+	nfs4_unlock_state();
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+__be32
+nfsd4_destroy_session(struct svc_rqst *r,
+		      struct nfsd4_compound_state *cstate,
+		      struct nfsd4_destroy_session *sessionid)
+{
+	struct nfsd4_session *ses;
+	u32 status = nfserr_badsession;
+
+	/* Notes:
+	 * - The confirmed nfs4_client->cl_sessionid holds the destroyed
+	 *   sessionid
+	 * - Should we return nfserr_back_chan_busy if waiting for
+	 *   callbacks on to-be-destroyed session?
+	 * - Do we need to clear any callback info from previous session?
+	 */
+
+	dump_sessionid(__func__, &sessionid->sessionid);
+	spin_lock(&sessionid_lock);
+	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+	if (!ses) {
+		spin_unlock(&sessionid_lock);
+		goto out;
+	}
+
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+
+	/* wait for callbacks */
+	shutdown_callback_client(ses->se_client);
+	nfsd4_put_session(ses);
+	status = nfs_ok;
+out:
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+__be32
+nfsd4_sequence(struct svc_rqst *rqstp,
+	       struct nfsd4_compound_state *cstate,
+	       struct nfsd4_sequence *seq)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_session *session;
+	struct nfsd4_slot *slot;
+	int status;
+
+	if (resp->opcnt != 1)
+		return nfserr_sequence_pos;
+
+	spin_lock(&sessionid_lock);
+	status = nfserr_badsession;
+	session = find_in_sessionid_hashtbl(&seq->sessionid);
+	if (!session)
+		goto out;
+
+	status = nfserr_badslot;
+	if (seq->slotid >= session->se_fnumslots)
+		goto out;
+
+	slot = &session->se_slots[seq->slotid];
+	dprintk("%s: slotid %d\n", __func__, seq->slotid);
+
+	status = check_slot_seqid(seq->seqid, slot);
+	if (status == nfserr_replay_cache) {
+		cstate->slot = slot;
+		cstate->session = session;
+		/* Return the cached reply status and set cstate->status
+		 * for nfsd4_svc_encode_compoundres processing */
+		status = nfsd4_replay_cache_entry(resp, seq);
+		cstate->status = nfserr_replay_cache;
+		goto replay_cache;
+	}
+	if (status)
+		goto out;
+
+	/* Success! bump slot seqid */
+	slot->sl_inuse = true;
+	slot->sl_seqid = seq->seqid;
+	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
+	/* Always set the cache entry cachethis for solo sequence */
+	if (nfsd4_is_solo_sequence(resp))
+		slot->sl_cache_entry.ce_cachethis = 1;
+
+	cstate->slot = slot;
+	cstate->session = session;
+
+replay_cache:
+	/* Renew the clientid on success and on replay.
+	 * Hold a session reference until done processing the compound:
+	 * nfsd4_put_session called only if the cstate slot is set.
+	 */
+	renew_client(session->se_client);
+	nfsd4_get_session(session);
+out:
+	spin_unlock(&sessionid_lock);
+	dprintk("%s: return %d\n", __func__, ntohl(status));
+	return status;
+}
+
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
@@ -716,14 +1545,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	strhashval = clientstr_hashval(dname);
 
 	nfs4_lock_state();
-	conf = find_confirmed_client_by_str(dname, strhashval);
+	conf = find_confirmed_client_by_str(dname, strhashval, false);
 	if (conf) {
 		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
-		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
-				|| conf->cl_addr != sin->sin_addr.s_addr) {
-			dprintk("NFSD: setclientid: string in use by clientat %pI4\n",
-				&conf->cl_addr);
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			dprintk("NFSD: setclientid: string in use by client"
+				" at %pI4\n", &conf->cl_addr);
 			goto out;
 		}
 	}
@@ -732,7 +1560,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * has a description of SETCLIENTID request processing consisting
 	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
 	 */
-	unconf = find_unconfirmed_client_by_str(dname, strhashval);
+	unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
 	status = nfserr_resource;
 	if (!conf) {
 		/*
@@ -887,7 +1715,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		unsigned int hash = clientstr_hashval(unconf->cl_recdir);
 
 		conf = find_confirmed_client_by_str(unconf->cl_recdir,
-							    hash);
+						    hash, false);
 		if (conf) {
 			nfsd4_remove_clid_dir(conf);
 			expire_client(conf);
@@ -923,11 +1751,13 @@ alloc_init_file(struct inode *ino)
 
 	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
 	if (fp) {
-		kref_init(&fp->fi_ref);
+		atomic_set(&fp->fi_ref, 1);
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
+		spin_lock(&recall_lock);
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
@@ -1037,48 +1867,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	return sop;
 }
 
-static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
-{
-	struct nfs4_stateowner *lock_sop;
-
-	while (!list_empty(&open_stp->st_lockowners)) {
-		lock_sop = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_stateowner, so_perstateid);
-		/* list_del(&open_stp->st_lockowners);  */
-		BUG_ON(lock_sop->so_is_open_owner);
-		release_stateowner(lock_sop);
-	}
-}
-
-static void
-unhash_stateowner(struct nfs4_stateowner *sop)
-{
-	struct nfs4_stateid *stp;
-
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	if (sop->so_is_open_owner)
-		list_del(&sop->so_perclient);
-	list_del(&sop->so_perstateid);
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_entry(sop->so_stateids.next,
-			struct nfs4_stateid, st_perstateowner);
-		if (sop->so_is_open_owner)
-			release_stateid(stp, OPEN_STATE);
-		else
-			release_stateid(stp, LOCK_STATE);
-	}
-}
-
-static void
-release_stateowner(struct nfs4_stateowner *sop)
-{
-	unhash_stateowner(sop);
-	list_del(&sop->so_close_lru);
-	nfs4_put_stateowner(sop);
-}
-
 static inline void
 init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
 	struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1888,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateid.si_generation = 0;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
+		  &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 	stp->st_openstp = NULL;
 }
 
 static void
-release_stateid(struct nfs4_stateid *stp, int flags)
-{
-	struct file *filp = stp->st_vfs_file;
-
-	list_del(&stp->st_hash);
-	list_del(&stp->st_perfile);
-	list_del(&stp->st_perstateowner);
-	if (flags & OPEN_STATE) {
-		release_stateid_lockowners(stp);
-		stp->st_vfs_file = NULL;
-		nfsd_close(filp);
-	} else if (flags & LOCK_STATE)
-		locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
-	put_nfs4_file(stp->st_file);
-	kmem_cache_free(stateid_slab, stp);
-}
-
-static void
 move_to_close_lru(struct nfs4_stateowner *sop)
 {
 	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1931,33 @@ find_file(struct inode *ino)
 	unsigned int hashval = file_hashval(ino);
 	struct nfs4_file *fp;
 
+	spin_lock(&recall_lock);
 	list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
 		if (fp->fi_inode == ino) {
 			get_nfs4_file(fp);
+			spin_unlock(&recall_lock);
 			return fp;
 		}
 	}
+	spin_unlock(&recall_lock);
 	return NULL;
 }
 
-static inline int access_valid(u32 x)
+static inline int access_valid(u32 x, u32 minorversion)
 {
-	if (x < NFS4_SHARE_ACCESS_READ)
+	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
 		return 0;
-	if (x > NFS4_SHARE_ACCESS_BOTH)
+	if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
+		return 0;
+	x &= ~NFS4_SHARE_ACCESS_MASK;
+	if (minorversion && x) {
+		if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
+			return 0;
+		if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
+			return 0;
+		x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
+	}
+	if (x)
 		return 0;
 	return 1;
 }
@@ -1409,7 +2193,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
 
 
 __be32
-nfsd4_process_open1(struct nfsd4_open *open)
+nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+		    struct nfsd4_open *open)
 {
 	clientid_t *clientid = &open->op_clientid;
 	struct nfs4_client *clp = NULL;
@@ -1432,10 +2217,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
 			return nfserr_expired;
 		goto renew;
 	}
+	/* When sessions are used, skip open sequenceid processing */
+	if (nfsd4_has_session(cstate))
+		goto renew;
 	if (!sop->so_confirmed) {
 		/* Replace unconfirmed owners without checking for replay. */
 		clp = sop->so_client;
-		release_stateowner(sop);
+		release_openowner(sop);
 		open->op_stateowner = NULL;
 		goto renew;
 	}
@@ -1709,6 +2497,7 @@ out:
 __be32
 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfs4_file *fp = NULL;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2505,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	__be32 status;
 
 	status = nfserr_inval;
-	if (!access_valid(open->op_share_access)
+	if (!access_valid(open->op_share_access, resp->cstate.minorversion)
 			|| !deny_valid(open->op_share_deny))
 		goto out;
 	/*
@@ -1764,12 +2553,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		init_stateid(stp, fp, open);
 		status = nfsd4_truncate(rqstp, current_fh, open);
 		if (status) {
-			release_stateid(stp, OPEN_STATE);
+			release_open_stateid(stp);
 			goto out;
 		}
+		if (nfsd4_has_session(&resp->cstate))
+			update_stateid(&stp->st_stateid);
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
+	if (nfsd4_has_session(&resp->cstate))
+		open->op_stateowner->so_confirmed = 1;
+
 	/*
 	 * Attempt to hand out a delegation. No error return, because the
 	 * OPEN succeeds even if we fail.
@@ -1790,7 +2584,8 @@ out:
 	 * To finish the open response, we just need to set the rflags.
 	 */
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!open->op_stateowner->so_confirmed)
+	if (!open->op_stateowner->so_confirmed &&
+	    !nfsd4_has_session(&resp->cstate))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
 
 	return status;
@@ -1898,7 +2693,7 @@ nfs4_laundromat(void)
 		}
 		dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
 			sop->so_id);
-		release_stateowner(sop);
+		release_openowner(sop);
 	}
 	if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
 		clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2778,7 @@ out:
 static inline __be32
 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 {
-	/* Trying to call delegreturn with a special stateid? Yuch: */
-	if (!(flags & (RD_STATE | WR_STATE)))
-		return nfserr_bad_stateid;
-	else if (ONE_STATEID(stateid) && (flags & RD_STATE))
+	if (ONE_STATEID(stateid) && (flags & RD_STATE))
 		return nfs_ok;
 	else if (locks_in_grace()) {
 		/* Answer in remaining cases depends on existence of
@@ -2005,14 +2797,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
  * that are not able to provide mandatory locking.
  */
 static inline int
-io_during_grace_disallowed(struct inode *inode, int flags)
+grace_disallows_io(struct inode *inode)
 {
-	return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
-		&& mandatory_lock(inode);
+	return locks_in_grace() && mandatory_lock(inode);
 }
 
-static int check_stateid_generation(stateid_t *in, stateid_t *ref)
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
 {
+	/*
+	 * When sessions are used the stateid generation number is ignored
+	 * when it is zero.
+	 */
+	if ((flags & HAS_SESSION) && in->si_generation == 0)
+		goto out;
+
+	/* If the client sends us a stateid from the future, it's buggy: */
 	if (in->si_generation > ref->si_generation)
 		return nfserr_bad_stateid;
@@ -2028,74 +2826,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
 	 */
 	if (in->si_generation < ref->si_generation)
 		return nfserr_old_stateid;
+out:
 	return nfs_ok;
 }
 
+static int is_delegation_stateid(stateid_t *stateid)
+{
+	return stateid->si_fileid == 0;
+}
+
 /*
 * Checks for stateid operations
 */
 __be32
-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
+nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
+			   stateid_t *stateid, int flags, struct file **filpp)
 {
 	struct nfs4_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
-	stateid_t *stidp;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	__be32 status;
 
-	dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
-		stateid->si_boot, stateid->si_stateownerid,
-		stateid->si_fileid, stateid->si_generation);
 	if (filpp)
 		*filpp = NULL;
 
-	if (io_during_grace_disallowed(ino, flags))
+	if (grace_disallows_io(ino))
 		return nfserr_grace;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return check_special_stateids(current_fh, stateid, flags);
 
-	/* STALE STATEID */
 	status = nfserr_stale_stateid;
 	if (STALE_STATEID(stateid))
 		goto out;
 
-	/* BAD STATEID */
 	status = nfserr_bad_stateid;
-	if (!stateid->si_fileid) { /* delegation stateid */
-		if(!(dp = find_delegation_stateid(ino, stateid))) {
-			dprintk("NFSD: delegation stateid not found\n");
+	if (is_delegation_stateid(stateid)) {
+		dp = find_delegation_stateid(ino, stateid);
+		if (!dp)
 			goto out;
-		}
-		stidp = &dp->dl_stateid;
+		status = check_stateid_generation(stateid, &dp->dl_stateid,
+						  flags);
+		if (status)
+			goto out;
+		status = nfs4_check_delegmode(dp, flags);
+		if (status)
+			goto out;
+		renew_client(dp->dl_client);
+		if (filpp)
+			*filpp = dp->dl_vfs_file;
 	} else { /* open or lock stateid */
-		if (!(stp = find_stateid(stateid, flags))) {
-			dprintk("NFSD: open or lock stateid not found\n");
+		stp = find_stateid(stateid, flags);
+		if (!stp)
 			goto out;
-		}
-		if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
+		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
 			goto out;
-		stidp = &stp->st_stateid;
-	}
-	status = check_stateid_generation(stateid, stidp);
-	if (status)
-		goto out;
-	if (stp) {
-		if ((status = nfs4_check_openmode(stp,flags)))
+		status = check_stateid_generation(stateid, &stp->st_stateid,
+						  flags);
+		if (status)
+			goto out;
+		status = nfs4_check_openmode(stp, flags);
+		if (status)
 			goto out;
 		renew_client(stp->st_stateowner->so_client);
 		if (filpp)
 			*filpp = stp->st_vfs_file;
-	} else {
-		if ((status = nfs4_check_delegmode(dp, flags)))
-			goto out;
-		renew_client(dp->dl_client);
-		if (flags & DELEG_RET)
-			unhash_delegation(dp);
-		if (filpp)
-			*filpp = dp->dl_vfs_file;
 	}
 	status = nfs_ok;
 out:
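Editor's note: check_stateid_generation() above compares the client's stateid generation with the server's: with sessions (v4.1), a zero generation from the client means "don't check"; otherwise newer-than-ours indicates a buggy client and older-than-ours an out-of-date stateid. A small model of that comparison, with stand-in result names:

	/* sketch: stateid generation check, v4.0 vs v4.1 behavior */
	#include <assert.h>
	#include <stdint.h>

	enum { S_OK, S_BAD, S_OLD };
	#define HAS_SESSION 0x1	/* stand-in for the nfsd flag */

	static int gen_check(uint32_t in, uint32_t ref, int flags)
	{
		if ((flags & HAS_SESSION) && in == 0)
			return S_OK;	/* v4.1: generation 0 is a wildcard */
		if (in > ref)
			return S_BAD;	/* stateid "from the future" */
		if (in < ref)
			return S_OLD;	/* superseded by a later OPEN/LOCK */
		return S_OK;
	}

	int main(void)
	{
		assert(gen_check(0, 7, HAS_SESSION) == S_OK);
		assert(gen_check(0, 7, 0) == S_OLD);	/* v4.0: really old */
		assert(gen_check(8, 7, 0) == S_BAD);
		assert(gen_check(7, 7, 0) == S_OK);
		return 0;
	}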
@@ -2113,10 +2914,14 @@ setlkflg (int type)
  * Checks for sequence id mutating operations.
  */
 static __be32
-nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+			 stateid_t *stateid, int flags,
+			 struct nfs4_stateowner **sopp,
+			 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
 {
 	struct nfs4_stateid *stp;
 	struct nfs4_stateowner *sop;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
 
 	dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2939,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	if (STALE_STATEID(stateid))
 		return nfserr_stale_stateid;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	/*
 	 * We return BAD_STATEID if filehandle doesn't match stateid,
 	 * the confirmed flag is incorrectly set, or the generation
@@ -2166,8 +2975,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	if (lock->lk_is_new) {
 		if (!sop->so_is_open_owner)
 			return nfserr_bad_stateid;
-		if (!same_clid(&clp->cl_clientid, lockclid))
-			return nfserr_bad_stateid;
+		if (!(flags & HAS_SESSION) &&
+		    !same_clid(&clp->cl_clientid, lockclid))
+			return nfserr_bad_stateid;
 		/* stp is the open stateid */
 		status = nfs4_check_openmode(stp, lkflg);
 		if (status)
@@ -2190,7 +3000,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	 * For the moment, we ignore the possibility of
 	 * generation number wraparound.
 	 */
-	if (seqid != sop->so_seqid)
+	if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
 		goto check_replay;
 
 	if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3013,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 			" confirmed yet!\n");
 		return nfserr_bad_stateid;
 	}
-	status = check_stateid_generation(stateid, &stp->st_stateid);
+	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
 	if (status)
 		return status;
 	renew_client(sop->so_client);
@@ -2239,7 +3049,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					oc->oc_seqid, &oc->oc_req_stateid,
 					CONFIRM | OPEN_STATE,
 					&oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3114,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
-	if (!access_valid(od->od_share_access)
+	if (!access_valid(od->od_share_access, cstate->minorversion)
 			|| !deny_valid(od->od_share_deny))
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					od->od_seqid,
 					&od->od_stateid,
 					OPEN_STATE,
@@ -2362,7 +3172,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check close_lru for replay */
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					close->cl_seqid,
 					&close->cl_stateid,
 					OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3183,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
 
 	/* release_stateid() calls nfsd_close() if needed */
-	release_stateid(stp, OPEN_STATE);
+	release_open_stateid(stp);
 
 	/* place unused nfs4_stateowners on so_close_lru list to be
 	 * released by the laundromat service after the lease period
@@ -2394,16 +3204,40 @@ __be32
 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_delegreturn *dr)
 {
+	struct nfs4_delegation *dp;
+	stateid_t *stateid = &dr->dr_stateid;
+	struct inode *inode;
 	__be32 status;
+	int flags = 0;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
-		goto out;
+		return status;
+	inode = cstate->current_fh.fh_dentry->d_inode;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-					    &dr->dr_stateid, DELEG_RET, NULL);
-	nfs4_unlock_state();
+	status = nfserr_bad_stateid;
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		goto out;
+	status = nfserr_stale_stateid;
+	if (STALE_STATEID(stateid))
+		goto out;
+	status = nfserr_bad_stateid;
+	if (!is_delegation_stateid(stateid))
+		goto out;
+	dp = find_delegation_stateid(inode, stateid);
+	if (!dp)
+		goto out;
+	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+	if (status)
+		goto out;
+	renew_client(dp->dl_client);
+
+	unhash_delegation(dp);
 out:
+	nfs4_unlock_state();
+
 	return status;
 }
 
@@ -2684,11 +3518,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		struct nfs4_file *fp;
 
 		status = nfserr_stale_clientid;
-		if (STALE_CLIENTID(&lock->lk_new_clientid))
+		if (!nfsd4_has_session(cstate) &&
+		    STALE_CLIENTID(&lock->lk_new_clientid))
 			goto out;
 
 		/* validate and update open stateid and open seqid */
-		status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+		status = nfs4_preprocess_seqid_op(cstate,
 				        lock->lk_new_open_seqid,
 		                        &lock->lk_new_open_stateid,
 					OPEN_STATE,
@@ -2715,7 +3550,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	} else {
 		/* lock (lock owner + lock stateid) already exists */
-		status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid,
 				       &lock->lk_old_lock_stateid,
 				       LOCK_STATE,
@@ -2788,7 +3623,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 out:
 	if (status && lock->lk_is_new && lock_sop)
-		release_stateowner(lock_sop);
+		release_lockowner(lock_sop);
 	if (lock->lk_replay_owner) {
 		nfs4_get_stateowner(lock->lk_replay_owner);
 		cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3673,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
 	status = nfserr_stale_clientid;
-	if (STALE_CLIENTID(&lockt->lt_clientid))
+	if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
 		goto out;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3746,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					locku->lu_seqid,
 					&locku->lu_stateid,
 					LOCK_STATE,
@@ -3037,7 +3872,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 				/* unhash_stateowner deletes so_perclient only
 				 * for openowners. */
 				list_del(&sop->so_perclient);
-				release_stateowner(sop);
+				release_lockowner(sop);
 			}
 	out:
 	nfs4_unlock_state();
@@ -3051,12 +3886,12 @@ alloc_reclaim(void)
 }
 
 int
-nfs4_has_reclaimed_state(const char *name)
+nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
 {
 	unsigned int strhashval = clientstr_hashval(name);
 	struct nfs4_client *clp;
 
-	clp = find_confirmed_client_by_str(name, strhashval);
+	clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
 	return clp ? 1 : 0;
 }
@@ -3153,6 +3988,8 @@ nfs4_state_init(void)
 		INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
 		INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
 	}
+	for (i = 0; i < SESSION_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&sessionid_hashtbl[i]);
 	for (i = 0; i < FILE_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&file_hashtbl[i]);
 	}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b73549d293be 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/vfs.h>
+#include <linux/utsname.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 	return p;
 }
 
+static int zero_clientid(clientid_t *clid)
+{
+	return (clid->cl_boot == 0) && (clid->cl_id == 0);
+}
+
 static int
 defer_free(struct nfsd4_compoundargs *argp,
 		void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 
 	bmval[0] = 0;
 	bmval[1] = 0;
+	bmval[2] = 0;
 
 	READ_BUF(4);
 	READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 	READ32(bmval[0]);
 	if (bmlen > 1)
 		READ32(bmval[1]);
+	if (bmlen > 2)
+		READ32(bmval[2]);
 
 	DECODE_TAIL;
 }
 
+static u32 nfsd_attrmask[] = {
+	NFSD_WRITEABLE_ATTRS_WORD0,
+	NFSD_WRITEABLE_ATTRS_WORD1,
+	NFSD_WRITEABLE_ATTRS_WORD2
+};
+
+static u32 nfsd41_ex_attrmask[] = {
+	NFSD_SUPPATTR_EXCLCREAT_WORD0,
+	NFSD_SUPPATTR_EXCLCREAT_WORD1,
+	NFSD_SUPPATTR_EXCLCREAT_WORD2
+};
+
 static __be32
-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
-    struct nfs4_acl **acl)
+nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
+		   struct iattr *iattr, struct nfs4_acl **acl)
 {
 	int expected_len, len = 0;
 	u32 dummy32;
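Editor's note: nfsd4_decode_bitmap() above now accepts a third bitmap word for the v4.1 WORD2 attributes. On the wire an attribute bitmap is a word count followed by that many 32-bit mask words; the decoder keeps the first three and skips any extras. A small model of that variable-length decode (buffer handling is simplified relative to the READ_BUF/READ32 macros):

	/* sketch: variable-length attribute-bitmap decode */
	#include <assert.h>
	#include <stdint.h>

	static int decode_bitmap(const uint32_t *buf, int buflen, uint32_t bmval[3])
	{
		uint32_t bmlen;
		int i;

		bmval[0] = bmval[1] = bmval[2] = 0;
		if (buflen < 1)
			return -1;
		bmlen = buf[0];
		if ((int)bmlen > buflen - 1)
			return -1;		/* truncated request */
		for (i = 0; i < (int)bmlen; i++)
			if (i < 3)		/* words 4+ are ignored */
				bmval[i] = buf[1 + i];
		return 1 + bmlen;		/* words consumed */
	}

	int main(void)
	{
		uint32_t wire[] = { 2, 0x0018091a, 0x00b0a23a };
		uint32_t bm[3];

		assert(decode_bitmap(wire, 3, bm) == 3);
		assert(bm[0] == 0x0018091a && bm[1] == 0x00b0a23a && bm[2] == 0);
		return 0;
	}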
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 	 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
 	 * read-only attributes return ERR_INVAL.
 	 */
-	if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+	if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
+	    (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
+	    (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
 		return nfserr_attrnotsupp;
-	if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1))
+	if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
+	    (bmval[2] & ~writable[2]))
 		return nfserr_inval;
 
 	READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 			goto xdr_error;
 		}
 	}
+	BUG_ON(bmval[2]);	/* no such writeable attr supported yet */
 	if (len != expected_len)
 		goto xdr_error;
 
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 	if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
 		return status;
 
-	if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl)))
+	status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
+				    &create->cr_iattr, &create->cr_acl);
+	if (status)
 		goto out;
 
 	DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
 	READ_BUF(lockt->lt_owner.len);
 	READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
 
+	if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
+		return nfserr_inval;
 	DECODE_TAIL;
 }
 
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		switch (open->op_createmode) {
 		case NFS4_CREATE_UNCHECKED:
 		case NFS4_CREATE_GUARDED:
-			if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl)))
+			status = nfsd4_decode_fattr(argp, open->op_bmval,
+				nfsd_attrmask, &open->op_iattr, &open->op_acl);
+			if (status)
 				goto out;
 			break;
 		case NFS4_CREATE_EXCLUSIVE:
 			READ_BUF(8);
 			COPYMEM(open->op_verf.data, 8);
 			break;
+		case NFS4_CREATE_EXCLUSIVE4_1:
+			if (argp->minorversion < 1)
+				goto xdr_error;
+			READ_BUF(8);
+			COPYMEM(open->op_verf.data, 8);
+			status = nfsd4_decode_fattr(argp, open->op_bmval,
+				nfsd41_ex_attrmask, &open->op_iattr,
+				&open->op_acl);
+			if (status)
+				goto out;
+			break;
 		default:
 			goto xdr_error;
 		}
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
 	status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
 	if (status)
 		return status;
-	return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+	return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
 				  &setattr->sa_iattr, &setattr->sa_acl);
 }
 
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
 	READ_BUF(rlockowner->rl_owner.len);
 	READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
 
+	if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
+		return nfserr_inval;
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
+			 struct nfsd4_exchange_id *exid)
+{
+	int dummy;
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_VERIFIER_SIZE);
+	COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
+
+	READ_BUF(4);
+	READ32(exid->clname.len);
+
+	READ_BUF(exid->clname.len);
+	SAVEMEM(exid->clname.data, exid->clname.len);
+
+	READ_BUF(4);
+	READ32(exid->flags);
+
+	/* Ignore state_protect4_a */
+	READ_BUF(4);
+	READ32(exid->spa_how);
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		/* spo_must_enforce */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* spo_must_allow */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+		break;
+	case SP4_SSV:
+		/* ssp_ops */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* ssp_hash_algs<> */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* ssp_encr_algs<> */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* ssp_window and ssp_num_gss_handles */
+		READ_BUF(8);
+		READ32(dummy);
+		READ32(dummy);
+		break;
+	default:
+		goto xdr_error;
+	}
+
+	/* Ignore Implementation ID */
+	READ_BUF(4);    /* nfs_impl_id4 array length */
+	READ32(dummy);
+
+	if (dummy > 1)
+		goto xdr_error;
+
+	if (dummy == 1) {
+		/* nii_domain */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* nii_name */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* nii_date */
+		READ_BUF(12);
+		p += 3;
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
+			    struct nfsd4_create_session *sess)
+{
+	DECODE_HEAD;
+
+	u32 dummy;
+	char *machine_name;
+	int i;
+	int nr_secflavs;
+
+	READ_BUF(16);
+	COPYMEM(&sess->clientid, 8);
+	READ32(sess->seqid);
+	READ32(sess->flags);
+
+	/* Fore channel attrs */
+	READ_BUF(28);
+	READ32(dummy); /* headerpadsz is always 0 */
+	READ32(sess->fore_channel.maxreq_sz);
+	READ32(sess->fore_channel.maxresp_sz);
+	READ32(sess->fore_channel.maxresp_cached);
+	READ32(sess->fore_channel.maxops);
+	READ32(sess->fore_channel.maxreqs);
+	READ32(sess->fore_channel.nr_rdma_attrs);
+	if (sess->fore_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		READ32(sess->fore_channel.rdma_attrs);
+	} else if (sess->fore_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many fore channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	/* Back channel attrs */
+	READ_BUF(28);
+	READ32(dummy); /* headerpadsz is always 0 */
+	READ32(sess->back_channel.maxreq_sz);
+	READ32(sess->back_channel.maxresp_sz);
+	READ32(sess->back_channel.maxresp_cached);
+	READ32(sess->back_channel.maxops);
+	READ32(sess->back_channel.maxreqs);
+	READ32(sess->back_channel.nr_rdma_attrs);
+	if (sess->back_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		READ32(sess->back_channel.rdma_attrs);
+	} else if (sess->back_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many back channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	READ_BUF(8);
+	READ32(sess->callback_prog);
+
+	/* callback_sec_params4 */
+	READ32(nr_secflavs);
+	for (i = 0; i < nr_secflavs; ++i) {
+		READ_BUF(4);
+		READ32(dummy);
+		switch (dummy) {
+		case RPC_AUTH_NULL:
+			/* Nothing to read */
+			break;
+		case RPC_AUTH_UNIX:
+			READ_BUF(8);
+			/* stamp */
+			READ32(dummy);
+
+			/* machine name */
+			READ32(dummy);
+			READ_BUF(dummy);
+			SAVEMEM(machine_name, dummy);
+
+			/* uid, gid */
+			READ_BUF(8);
+			READ32(sess->uid);
+			READ32(sess->gid);
+
+			/* more gids */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy * 4);
+			for (i = 0; i < dummy; ++i)
+				READ32(dummy);
+			break;
+		case RPC_AUTH_GSS:
+			dprintk("RPC_AUTH_GSS callback secflavor "
+				"not supported!\n");
+			READ_BUF(8);
+			/* gcbp_service */
+			READ32(dummy);
+			/* gcbp_handle_from_server */
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			/* gcbp_handle_from_client */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			break;
+		default:
+			dprintk("Illegal callback secflavor\n");
+			return nfserr_inval;
+		}
+	}
+	DECODE_TAIL;
+}
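Editor's note: the SEQUENCE decoder that follows reads a fixed-size payload, a 16-byte session id followed by four 32-bit fields, with a single up-front length check. A hedged user-space model of that shape (a byte cursor stands in for the READ_BUF()/READ32() macros; the constant name is an assumption):

	/* sketch: fixed-layout SEQUENCE argument decode */
	#include <assert.h>
	#include <stdint.h>
	#include <string.h>

	#define SESSIONID_LEN 16	/* stand-in for NFS4_MAX_SESSIONID_LEN */

	struct seq_args {
		unsigned char sessionid[SESSIONID_LEN];
		uint32_t seqid, slotid, maxslots, cachethis;
	};

	static int decode_sequence(const unsigned char *p, size_t len,
				   struct seq_args *seq)
	{
		uint32_t *fields[] = { &seq->seqid, &seq->slotid,
				       &seq->maxslots, &seq->cachethis };

		if (len < SESSIONID_LEN + 16)	/* one length check up front */
			return -1;
		memcpy(seq->sessionid, p, SESSIONID_LEN);
		p += SESSIONID_LEN;
		/* four network-order words, as READ32 would consume them */
		for (int i = 0; i < 4; i++, p += 4)
			*fields[i] = (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
				     (uint32_t)p[2] << 8 | p[3];
		return 0;
	}

	int main(void)
	{
		unsigned char wire[SESSIONID_LEN + 16] = { 0 };
		struct seq_args seq;

		wire[SESSIONID_LEN + 3] = 7;	/* seqid = 7 */
		assert(decode_sequence(wire, sizeof(wire), &seq) == 0);
		assert(seq.seqid == 7 && seq.cachethis == 0);
		return 0;
	}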
+nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) +{ + DECODE_HEAD; + READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(seq->seqid); + READ32(seq->slotid); + READ32(seq->maxslots); + READ32(seq->cachethis); + DECODE_TAIL; } @@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) static __be32 nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) { - return nfserr_opnotsupp; + return nfserr_notsupp; } typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); @@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_READ] = (nfsd4_dec)nfsd4_decode_read, [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, @@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, }; +static nfsd4_dec nfsd41_dec_ops[] = { + [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp, + [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp, + + /* new operations for NFSv4.1 */ + [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp, + [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session, + [OP_DESTROY_SESSION] 
(nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp, +}; + struct nfsd4_minorversion_ops { nfsd4_dec *decoders; int nops; @@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops { static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, + [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, }; static __be32 @@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, { u32 bmval0 = bmval[0]; u32 bmval1 = bmval[1]; + u32 bmval2 = bmval[2]; struct kstat stat; struct svc_fh tempfh; struct kstatfs statfs; @@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, int err; int aclsupport = 0; struct nfs4_acl *acl = NULL; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); - BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); + BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); + BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); + BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); if (exp->ex_fslocs.migrated) { + BUG_ON(bmval[2]); status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); if (status) goto out; @@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((buflen -= 16) < 0) goto out_resource; - WRITE32(2); - WRITE32(bmval0); - WRITE32(bmval1); + if (unlikely(bmval2)) { + WRITE32(3); + WRITE32(bmval0); + WRITE32(bmval1); + WRITE32(bmval2); + } else if (likely(bmval1)) { + WRITE32(2); + WRITE32(bmval0); + WRITE32(bmval1); + } else { + WRITE32(1); + WRITE32(bmval0); + } attrlenp = p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; + u32 word0 = nfsd_suppattrs0(minorversion); + u32 word1 = nfsd_suppattrs1(minorversion); + u32 word2 = nfsd_suppattrs2(minorversion); + if ((buflen -= 12) < 0) goto out_resource; if (!aclsupport) word0 &= ~FATTR4_WORD0_ACL; if (!exp->ex_fslocs.locations) word0 &= ~FATTR4_WORD0_FS_LOCATIONS; - WRITE32(2); - WRITE32(word0); - WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); + if (!word2) { + WRITE32(2); + WRITE32(word0); + WRITE32(word1); + } else { + WRITE32(3); + WRITE32(word0); + WRITE32(word1); + WRITE32(word2); + } } if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) @@ -1801,6 +2165,13 @@ out_acl: } WRITE64(stat.ino); } + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + } + *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; status = nfs_ok; @@ -1843,6 +2214,15 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, dentry = 
lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); + if (!dentry->d_inode) { + /* + * nfsd_buffered_readdir drops the i_mutex between + * readdir and calling this callback, leaving a window + * where this directory entry could have gone away. + */ + dput(dentry); + return nfserr_noent; + } exp_get(exp); /* @@ -1905,6 +2285,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); int buflen; __be32 *p = cd->buffer; + __be32 *cookiep; __be32 nfserr = nfserr_toosmall; /* In nfsv4, "." and ".." never make it onto the wire.. */ @@ -1921,7 +2302,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, goto fail; *p++ = xdr_one; /* mark entry present */ - cd->offset = p; /* remember pointer */ + cookiep = p; p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ @@ -1935,6 +2316,8 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, goto fail; case nfserr_dropit: goto fail; + case nfserr_noent: + goto skip_entry; default: /* * If the client requested the RDATTR_ERROR attribute, @@ -1953,6 +2336,8 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, } cd->buflen -= (p - cd->buffer); cd->buffer = p; + cd->offset = cookiep; +skip_entry: cd->common.err = nfs_ok; return 0; fail: @@ -2572,6 +2957,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w } static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_exchange_id *exid) +{ + ENCODE_HEAD; + char *major_id; + char *server_scope; + int major_id_sz; + int server_scope_sz; + uint64_t minor_id = 0; + + if (nfserr) + return nfserr; + + major_id = utsname()->nodename; + major_id_sz = strlen(major_id); + server_scope = utsname()->nodename; + server_scope_sz = strlen(server_scope); + + RESERVE_SPACE( + 8 /* eir_clientid */ + + 4 /* eir_sequenceid */ + + 4 /* eir_flags */ + + 4 /* spr_how (SP4_NONE) */ + + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + + WRITEMEM(&exid->clientid, 8); + WRITE32(exid->seqid); + WRITE32(exid->flags); + + /* state_protect4_r. 
Currently only support SP4_NONE */ + BUG_ON(exid->spa_how != SP4_NONE); + WRITE32(exid->spa_how); + + /* The server_owner struct */ + WRITE64(minor_id); /* Minor id */ + /* major id */ + WRITE32(major_id_sz); + WRITEMEM(major_id, major_id_sz); + + /* Server scope */ + WRITE32(server_scope_sz); + WRITEMEM(server_scope, server_scope_sz); + + /* Implementation id */ + WRITE32(0); /* zero length nfs_impl_id4 array */ + ADJUST_ARGS(); + return 0; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_create_session *sess) +{ + ENCODE_HEAD; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(24); + WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(sess->seqid); + WRITE32(sess->flags); + ADJUST_ARGS(); + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->fore_channel.maxreq_sz); + WRITE32(sess->fore_channel.maxresp_sz); + WRITE32(sess->fore_channel.maxresp_cached); + WRITE32(sess->fore_channel.maxops); + WRITE32(sess->fore_channel.maxreqs); + WRITE32(sess->fore_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->fore_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->fore_channel.rdma_attrs); + ADJUST_ARGS(); + } + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->back_channel.maxreq_sz); + WRITE32(sess->back_channel.maxresp_sz); + WRITE32(sess->back_channel.maxresp_cached); + WRITE32(sess->back_channel.maxops); + WRITE32(sess->back_channel.maxreqs); + WRITE32(sess->back_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->back_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->back_channel.rdma_attrs); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_destroy_session *destroy_session) +{ + return nfserr; +} + +__be32 +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_sequence *seq) +{ + ENCODE_HEAD; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); + WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(seq->seqid); + WRITE32(seq->slotid); + WRITE32(seq->maxslots); + /* + * FIXME: for now: + * target_maxslots = maxslots + * status_flags = 0 + */ + WRITE32(seq->maxslots); + WRITE32(0); + + ADJUST_ARGS(); + return 0; +} + +static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { return nfserr; @@ -2579,6 +3101,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +/* + * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 + * since we don't need to filter out obsolete ops as this is + * done in the decoding phase. 
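As an aside, a minimal sketch (not part of the patch; the helper name is invented) of how a compound processor could consult the per-minorversion decoder table introduced above, which is what makes sharing the encode vector safe:

```c
/*
 * Hypothetical helper, assuming the nfsd4_minorversion[] table from
 * this patch: look up the decoder for an op under a given
 * minorversion; out-of-range values return NULL so the caller can
 * reject the op before the encode phase is ever reached.
 */
static nfsd4_dec nfsd4_decoder_for(u32 minorversion, u32 opnum)
{
	struct nfsd4_minorversion_ops *ops;

	if (minorversion >= ARRAY_SIZE(nfsd4_minorversion))
		return NULL;
	ops = &nfsd4_minorversion[minorversion];
	if (opnum >= ops->nops)
		return NULL;
	return ops->decoders[opnum];
}
```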
+ */ static nfsd4_enc nfsd4_enc_ops[] = { [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, @@ -2617,8 +3144,77 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.1 operations */ + [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, }; +/* + * Calculate the total amount of memory that the compound response has taken + * after encoding the current operation. + * + * pad: add on 8 bytes for the next operation's op_code and status so that + * there is room to cache a failure on the next operation. + * + * Compare this length to the session se_fmaxresp_cached. + * + * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so + * will be at least a page and will therefore hold the xdr_buf head. 
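For readers tallying the arithmetic, here is a stand-alone sketch of the same length check; all names are illustrative, not the kernel's:

```c
#include <stddef.h>

/*
 * Illustrative version of the cached-reply size check: the reply built
 * so far (head + pages + tail), plus 8 bytes reserved for the next
 * op's opcode and status, must fit within the session's negotiated
 * maxresp_cached budget.
 */
static int reply_fits_drc(size_t head_len, size_t page_len,
			  size_t tail_len, int is_last_op,
			  size_t maxresp_cached)
{
	size_t pad = is_last_op ? 0 : 8;

	return head_len + page_len + tail_len + pad <= maxresp_cached;
}
```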
+ */ +static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +{ + int status = 0; + struct xdr_buf *xb = &resp->rqstp->rq_res; + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + struct nfsd4_session *session = NULL; + struct nfsd4_slot *slot = resp->cstate.slot; + u32 length, tlen = 0, pad = 8; + + if (!nfsd4_has_session(&resp->cstate)) + return status; + + session = resp->cstate.session; + if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0) + return status; + + if (resp->opcnt >= args->opcnt) + pad = 0; /* this is the last operation */ + + if (xb->page_len == 0) { + length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; + } else { + if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0) + tlen = (char *)resp->p - (char *)xb->tail[0].iov_base; + + length = xb->head[0].iov_len + xb->page_len + tlen + pad; + } + dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, + length, xb->page_len, tlen, pad); + + if (length <= session->se_fmaxresp_cached) + return status; + else + return nfserr_rep_too_big_to_cache; +} + void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { @@ -2635,6 +3231,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); + /* nfsd4_check_drc_limit guarantees enough room for error status */ + if (!op->status && nfsd4_check_drc_limit(resp)) + op->status = nfserr_rep_too_big_to_cache; status: /* * Note: We write the status directly, instead of using WRITE32(), @@ -2735,6 +3334,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov = &rqstp->rq_res.head[0]; iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); + if (nfsd4_has_session(&resp->cstate)) { + if (resp->cstate.status == nfserr_replay_cache && + !nfsd4_not_cached(resp)) { + iov->iov_len = resp->cstate.iovlen; + } else { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + resp->cstate.slot->sl_inuse = 0; + } + if (resp->cstate.session) + nfsd4_put_session(resp->cstate.session); + } return 1; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index a4ed8644d69c..af16849d243a 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -60,6 +60,7 @@ enum { NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, + NFSD_Pool_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, @@ -172,6 +173,16 @@ static const struct file_operations exports_operations = { .owner = THIS_MODULE, }; +extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); + +static struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@ -781,8 +792,9 @@ out_free: static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; - char *vers, sign; + char *vers, *minorp, sign; int len, num; + unsigned minor; ssize_t tlen = 0; char *sep; @@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) do { sign = *vers; if (sign == '+' || sign == '-') - num = simple_strtol((vers+1), NULL, 0); + num = simple_strtol((vers+1), &minorp, 0); else - num = simple_strtol(vers, NULL, 0); + num = 
simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num < 4) + return -EINVAL; + minor = simple_strtoul(minorp+1, NULL, 0); + if (minor == 0) + return -EINVAL; + if (nfsd_minorversion(minor, sign == '-' ? + NFSD_CLEAR : NFSD_SET) < 0) + return -EINVAL; + goto next; + } switch(num) { case 2: case 3: @@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) default: return -EINVAL; } + next: vers += len + 1; tlen += len; } while ((len = qword_get(&mesg, vers, size)) > 0); @@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) num); sep = " "; } + if (nfsd_vers(4, NFSD_AVAIL)) + for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++) + len += sprintf(buf+len, " %c4.%u", + (nfsd_vers(4, NFSD_TEST) && + nfsd_minorversion(minor, NFSD_TEST)) ? + '+' : '-', + minor); len += sprintf(buf+len, "\n"); return len; } @@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6f7f26351227..e298e260b5f1 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, { __be32 nfserr; int stable = 1; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &stable); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 7c09852be713..cbba4a935786 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -22,6 +22,7 @@ #include <linux/freezer.h> #include <linux/fs_struct.h> #include <linux/kthread.h> +#include <linux/swap.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> @@ -40,9 +41,6 @@ extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); struct timeval nfssvc_boot; -static atomic_t nfsd_busy; -static unsigned long nfsd_last_call; -static DEFINE_SPINLOCK(nfsd_call_lock); /* * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members @@ -123,6 +121,8 @@ struct svc_program nfsd_program = { }; +u32 nfsd_supported_minorversion; + int nfsd_vers(int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) @@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change) } return 0; } + +int nfsd_minorversion(u32 minorversion, enum vers_op change) +{ + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + return -1; + switch(change) { + case NFSD_SET: + nfsd_supported_minorversion = minorversion; + break; + case NFSD_CLEAR: + if (minorversion == 0) + return -1; + nfsd_supported_minorversion = minorversion - 1; + break; + case NFSD_TEST: + return minorversion <= nfsd_supported_minorversion; + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + } + return 0; +} + /* * Maximum number of nfsd 
processes */ @@ -200,6 +222,28 @@ void nfsd_reset_versions(void) } } +/* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with multiple services. + * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machine's free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. + */ +static void set_max_drc(void) +{ + /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ + #define NFSD_DRC_SIZE_SHIFT 7 + nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT; + nfsd_serv->sv_drc_pages_used = 0; + dprintk("%s svc_drc_max_pages %u\n", __func__, + nfsd_serv->sv_drc_max_pages); +} int nfsd_create_serv(void) { @@ -227,11 +271,12 @@ int nfsd_create_serv(void) nfsd_max_blksize /= 2; } - atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; + else + set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; @@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs) return error; } -static inline void -update_thread_usage(int busy_threads) -{ - unsigned long prev_call; - unsigned long diff; - int decile; - - spin_lock(&nfsd_call_lock); - prev_call = nfsd_last_call; - nfsd_last_call = jiffies; - decile = busy_threads*10/nfsdstats.th_cnt; - if (decile>0 && decile <= 10) { - diff = nfsd_last_call - prev_call; - if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP) - nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP; - if (decile == 10) - nfsdstats.th_fullcnt++; - } - spin_unlock(&nfsd_call_lock); -} /* * This is the NFS server kernel thread @@ -460,8 +485,6 @@ nfsd(void *vrqstp) continue; } - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. */ exp_readlock(); @@ -470,8 +493,6 @@ nfsd(void *vrqstp) /* Unlock export hash tables */ exp_readunlock(); - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_dec(&nfsd_busy); } /* Clear signals before calling svc_exit_thread() */ @@ -539,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) + rqstp->rq_res.head[0].iov_len; rqstp->rq_res.head[0].iov_len += sizeof(__be32); + /* NFSv4.1 DRC requires statp */ + if (rqstp->rq_vers == 4) + nfsd4_set_statp(rqstp, statp); + /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); @@ -570,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + if (nfsd_serv == NULL) + return -ENODEV; + return svc_pool_stats_open(nfsd_serv, file); +} diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 78376b6c0236..6c68ffd6b4bb 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, } if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ + /* + * This is subtle: dentry is *not* under mnt at this point. 
+ The only reason we are safe is that original mnt is pinned + down by exp, so we should dput before putting exp. + */ dput(dentry); *dpp = mounts; + exp_put(exp); + *expp = exp2; } else { exp_put(exp2); dput(mounts); @@ -366,8 +371,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } /* Revoke setuid/setgid on chown */ - if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || - ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { + if (!S_ISDIR(inode->i_mode) && + (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || + ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@ -960,7 +966,7 @@ static void kill_suid(struct dentry *dentry) static __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long cnt, int *stablep) + unsigned long *cnt, int *stablep) { struct svc_export *exp; struct dentry *dentry; @@ -974,7 +980,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) + (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) goto out; #endif @@ -1009,7 +1015,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); set_fs(oldfs); if (host_err >= 0) { - nfsdstats.io_write += cnt; + nfsdstats.io_write += host_err; fsnotify_modify(file->f_path.dentry); } @@ -1054,9 +1060,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) + if (host_err >= 0) { err = 0; - else + *cnt = host_err; + } else err = nfserrno(host_err); out: return err; @@ -1098,7 +1105,7 @@ out: */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int *stablep) { __be32 err = 0; @@ -1179,6 +1186,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, return 0; } +/* An HPUX client sometimes creates a file in mode 000 and sets its size to 0. + * Setting the size to 0 may fail on some file systems because the permission + * check requires WRITE permission while the mode is 000. + * We therefore ignore a resize to 0 on a freshly created file, since its size + * is already 0. + * + * Call this only after vfs_create() has been called. + */ +static void +nfsd_check_ignore_resizing(struct iattr *iap) +{ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; +} + /* * Create a file (regular, directory, device, fifo); UNIX sockets * not yet implemented. 
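The nfsd_write()/nfsd_vfs_write() signature change above turns the byte count into an in/out parameter, so short writes are reported to the caller instead of being miscounted in the statistics. A small user-space analogue of the pattern (purely illustrative, not kernel code):

```c
#include <stddef.h>
#include <unistd.h>

/*
 * User-space analogue of the reworked write path: the count is passed
 * by pointer, goes in as the requested length, and comes back as the
 * number of bytes actually written.
 */
static int write_counted(int fd, const void *buf, size_t *cnt)
{
	ssize_t n = write(fd, buf, *cnt);

	if (n < 0)
		return -1;	/* caller translates errno */
	*cnt = (size_t)n;	/* report bytes actually written */
	return 0;
}
```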
@@ -1274,6 +1296,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, switch (type) { case S_IFREG: host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (!host_err) + nfsd_check_ignore_resizing(iap); break; case S_IFDIR: host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); @@ -1427,6 +1451,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, /* setattr will sync the child (or not) */ } + nfsd_check_ignore_resizing(iap); + if (createmode == NFS3_CREATE_EXCLUSIVE) { /* Cram the verifier into atime/mtime */ iap->ia_valid = ATTR_MTIME|ATTR_ATIME @@ -1864,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, return 0; } -static int nfsd_buffered_readdir(struct file *file, filldir_t func, - struct readdir_cd *cdp, loff_t *offsetp) +static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, + struct readdir_cd *cdp, loff_t *offsetp) { struct readdir_data buf; struct buffered_dirent *de; @@ -1875,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, buf.dirent = (void *)__get_free_page(GFP_KERNEL); if (!buf.dirent) - return -ENOMEM; + return nfserrno(-ENOMEM); offset = *offsetp; while (1) { + struct inode *dir_inode = file->f_path.dentry->d_inode; unsigned int reclen; cdp->err = nfserr_eof; /* will be cleared on successful read */ @@ -1898,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, if (!size) break; + /* + * Various filldir functions may end up calling back into + * lookup_one_len() and the file system's ->lookup() method. + * These expect i_mutex to be held, as it would within readdir. + */ + host_err = mutex_lock_killable(&dir_inode->i_mutex); + if (host_err) + break; + de = (struct buffered_dirent *)buf.dirent; while (size > 0) { offset = de->offset; if (func(cdp, de->name, de->namlen, de->offset, de->ino, de->d_type)) - goto done; + break; if (cdp->err != nfs_ok) - goto done; + break; reclen = ALIGN(sizeof(*de) + de->namlen, sizeof(u64)); size -= reclen; de = (struct buffered_dirent *)((char *)de + reclen); } + mutex_unlock(&dir_inode->i_mutex); + if (size > 0) /* We bailed out early */ + break; + offset = vfs_llseek(file, 0, SEEK_CUR); } - done: free_page((unsigned long)(buf.dirent)); if (host_err) diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile new file mode 100644 index 000000000000..df3e62c1ddc5 --- /dev/null +++ b/fs/nilfs2/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_NILFS2_FS) += nilfs2.o +nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ + btnode.o bmap.o btree.o direct.o dat.o recovery.o \ + the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ + ifile.o alloc.o gcinode.o ioctl.o gcdat.o diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c new file mode 100644 index 000000000000..d69e6ae59251 --- /dev/null +++ b/fs/nilfs2/alloc.c @@ -0,0 +1,504 @@ +/* + * alloc.c - NILFS dat/inode allocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato <koji@osrg.net>. + * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, + * Amagai Yoshiji <amagai@osrg.net>. + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/bitops.h> +#include "mdt.h" +#include "alloc.h" + + +static inline unsigned long +nilfs_palloc_groups_per_desc_block(const struct inode *inode) +{ + return (1UL << inode->i_blkbits) / + sizeof(struct nilfs_palloc_group_desc); +} + +static inline unsigned long +nilfs_palloc_groups_count(const struct inode *inode) +{ + return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); +} + +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); + if (!mi->mi_bgl) + return -ENOMEM; + + bgl_lock_init(mi->mi_bgl); + + nilfs_mdt_set_entry_size(inode, entry_size, 0); + + mi->mi_blocks_per_group = + DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), + mi->mi_entries_per_block) + 1; + /* Number of blocks in a group including entry blocks and + a bitmap block */ + mi->mi_blocks_per_desc_block = + nilfs_palloc_groups_per_desc_block(inode) * + mi->mi_blocks_per_group + 1; + /* Number of blocks per descriptor including the + descriptor block */ + return 0; +} + +static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, + unsigned long *offset) +{ + __u64 group = nr; + + *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); + return group; +} + +static unsigned long +nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_block = + group / nilfs_palloc_groups_per_desc_block(inode); + return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; +} + +static unsigned long +nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_offset = + group % nilfs_palloc_groups_per_desc_block(inode); + return nilfs_palloc_desc_blkoff(inode, group) + 1 + + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; +} + +static unsigned long +nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, + const struct nilfs_palloc_group_desc *desc) +{ + unsigned long nfree; + + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + return nfree; +} + +static void +nilfs_palloc_group_desc_add_entries(struct inode *inode, + unsigned long group, + struct nilfs_palloc_group_desc *desc, + u32 n) +{ + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + le32_add_cpu(&desc->pg_nfrees, n); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); +} + +static unsigned long +nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) +{ + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, nr, &group_offset); + + return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + + group_offset / NILFS_MDT(inode)->mi_entries_per_block; +} + +static void nilfs_palloc_desc_block_init(struct inode *inode, + struct buffer_head *bh, void *kaddr) +{ + struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); + unsigned long n = nilfs_palloc_groups_per_desc_block(inode); + __le32 nfrees; + + nfrees = 
cpu_to_le32(nilfs_palloc_entries_per_group(inode)); + while (n-- > 0) { + desc->pg_nfrees = nfrees; + desc++; + } +} + +static int nilfs_palloc_get_desc_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, bhp); +} + +static int nilfs_palloc_get_bitmap_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp); +} + +int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp); +} + +static struct nilfs_palloc_group_desc * +nilfs_palloc_block_get_group_desc(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh, void *kaddr) +{ + return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + + group % nilfs_palloc_groups_per_desc_block(inode); +} + +static unsigned char * +nilfs_palloc_block_get_bitmap(const struct inode *inode, + const struct buffer_head *bh, void *kaddr) +{ + return (unsigned char *)(kaddr + bh_offset(bh)); +} + +void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, + const struct buffer_head *bh, void *kaddr) +{ + unsigned long entry_offset, group_offset; + + nilfs_palloc_group(inode, nr, &group_offset); + entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; + + return kaddr + bh_offset(bh) + + entry_offset * NILFS_MDT(inode)->mi_entry_size; +} + +static int nilfs_palloc_find_available_slot(struct inode *inode, + unsigned long group, + unsigned long target, + unsigned char *bitmap, + int bsize) /* size in bits */ +{ + int curr, pos, end, i; + + if (target > 0) { + end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, target); + if (pos < end && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) + return pos; + } else + end = 0; + + for (i = 0, curr = end; + i < bsize; + i += BITS_PER_LONG, curr += BITS_PER_LONG) { + /* wrap around */ + if (curr >= bsize) + curr = 0; + while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) + != ~0UL) { + end = curr + BITS_PER_LONG; + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, curr); + if ((pos < end) && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, + bitmap)) + return pos; + } + } + return -ENOSPC; +} + +static unsigned long +nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, + unsigned long curr, unsigned long max) +{ + return min_t(unsigned long, + nilfs_palloc_groups_per_desc_block(inode) - + curr % nilfs_palloc_groups_per_desc_block(inode), + max - curr + 1); +} + +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, maxgroup, ngroups; + unsigned long group_offset, maxgroup_offset; + unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long i, j; + int pos, ret; + + ngroups = nilfs_palloc_groups_count(inode); + maxgroup = ngroups - 1; + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + entries_per_group = 
nilfs_palloc_entries_per_group(inode); + groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); + + for (i = 0; i < ngroups; i += n) { + if (group >= ngroups) { + /* wrap around */ + group = 0; + maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, + &maxgroup_offset) - 1; + } + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + n = nilfs_palloc_rest_groups_in_desc_block(inode, group, + maxgroup); + for (j = 0; j < n; j++, desc++, group++) { + if (nilfs_palloc_group_desc_nfrees(inode, group, desc) + > 0) { + ret = nilfs_palloc_get_bitmap_block( + inode, group, 1, &bitmap_bh); + if (ret < 0) + goto out_desc; + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap( + inode, bitmap_bh, bitmap_kaddr); + pos = nilfs_palloc_find_available_slot( + inode, group, group_offset, bitmap, + entries_per_group); + if (pos >= 0) { + /* found a free entry */ + nilfs_palloc_group_desc_add_entries( + inode, group, desc, -1); + req->pr_entry_nr = + entries_per_group * group + pos; + kunmap(desc_bh->b_page); + kunmap(bitmap_bh->b_page); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; + } + kunmap(bitmap_bh->b_page); + brelse(bitmap_bh); + } + + group_offset = 0; + } + + kunmap(desc_bh->b_page); + brelse(desc_bh); + } + + /* no entries left */ + return -ENOSPC; + + out_desc: + kunmap(desc_bh->b_page); + brelse(desc_bh); + return ret; +} + +void nilfs_palloc_commit_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +void nilfs_palloc_commit_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + unsigned long group, group_offset; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, + bitmap_kaddr); + + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +void nilfs_palloc_abort_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + void *desc_kaddr, *bitmap_kaddr; + unsigned char *bitmap; + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, + bitmap_kaddr); + if 
(!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +int nilfs_palloc_prepare_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + unsigned long group, group_offset; + int ret; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; +} + +void nilfs_palloc_abort_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +static int +nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) +{ + __u64 first, last; + + first = group * nilfs_palloc_entries_per_group(inode); + last = first + nilfs_palloc_entries_per_group(inode) - 1; + return (nr >= first) && (nr <= last); +} + +int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, group_offset; + int i, j, n, ret; + + for (i = 0; i < nitems; i += n) { + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 0, + &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap( + inode, bitmap_bh, bitmap_kaddr); + for (j = i, n = 0; + (j < nitems) && nilfs_palloc_group_is_in(inode, group, + entry_nrs[j]); + j++, n++) { + nilfs_palloc_group(inode, entry_nrs[j], &group_offset); + if (!nilfs_clear_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) { + printk(KERN_WARNING + "%s: entry number %llu already freed\n", + __func__, + (unsigned long long)entry_nrs[j]); + } + } + nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + kunmap(bitmap_bh->b_page); + kunmap(desc_bh->b_page); + + nilfs_mdt_mark_buffer_dirty(desc_bh); + nilfs_mdt_mark_buffer_dirty(bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(bitmap_bh); + brelse(desc_bh); + } + return 0; +} diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h new file mode 100644 index 000000000000..4ace5475c2c7 --- /dev/null +++ b/fs/nilfs2/alloc.h @@ -0,0 +1,72 @@ +/* + * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato <koji@osrg.net>. + * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, + * Amagai Yoshiji <amagai@osrg.net>. + */ + +#ifndef _NILFS_ALLOC_H +#define _NILFS_ALLOC_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> + +static inline unsigned long +nilfs_palloc_entries_per_group(const struct inode *inode) +{ + return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BIT) */); +} + +int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_get_entry_block(struct inode *, __u64, int, + struct buffer_head **); +void *nilfs_palloc_block_get_entry(const struct inode *, __u64, + const struct buffer_head *, void *); + +/** + * nilfs_palloc_req - persistent allocator request and reply + * @pr_entry_nr: entry number (vblocknr or inode number) + * @pr_desc_bh: buffer head of the buffer containing block group descriptors + * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap + * @pr_entry_bh: buffer head of the buffer containing translation entries + */ +struct nilfs_palloc_req { + __u64 pr_entry_nr; + struct buffer_head *pr_desc_bh; + struct buffer_head *pr_bitmap_bh; + struct buffer_head *pr_entry_bh; +}; + +int nilfs_palloc_prepare_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_commit_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_freev(struct inode *, __u64 *, size_t); + +#define nilfs_set_bit_atomic ext2_set_bit_atomic +#define nilfs_clear_bit_atomic ext2_clear_bit_atomic +#define nilfs_find_next_zero_bit ext2_find_next_zero_bit + +#endif /* _NILFS_ALLOC_H */ diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c new file mode 100644 index 000000000000..064279e33bbb --- /dev/null +++ b/fs/nilfs2/bmap.c @@ -0,0 +1,788 @@ +/* + * bmap.c - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/errno.h> +#include "nilfs.h" +#include "bmap.h" +#include "sb.h" +#include "btnode.h" +#include "mdt.h" +#include "dat.h" +#include "alloc.h" + +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, + __u64 *ptrp) +{ + __u64 ptr; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); + if (ret < 0) + goto out; + if (bmap->b_pops->bpop_translate != NULL) { + ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr); + if (ret < 0) + goto out; + *ptrp = ptr; + } + + out: + up_read(&bmap->b_sem); + return ret; +} + + +/** + * nilfs_bmap_lookup - find a record + * @bmap: bmap + * @key: key + * @recp: pointer to record + * + * Description: nilfs_bmap_lookup() finds a record whose key matches @key in + * @bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @recp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_lookup(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long *recp) +{ + __u64 ptr; + int ret; + + /* XXX: use macro for level 1 */ + ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); + if (recp != NULL) + *recp = ptr; + return ret; +} + +static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; + __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_insert != NULL) { + ret = bmap->b_ops->bop_check_insert(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1); + if (n < 0) + return n; + ret = nilfs_btree_convert_and_insert( + bmap, key, ptr, keys, ptrs, n, + NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH); + if (ret == 0) + bmap->b_u.u_flags |= NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_insert(bmap, key, ptr); +} + +/** + * nilfs_bmap_insert - insert a new key-record pair into a bmap + * @bmap: bmap + * @key: key + * @rec: record + * + * Description: nilfs_bmap_insert() inserts the new key-record pair specified + * by @key and @rec into @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EEXIST - A record associated with @key already exists. 
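A hedged usage sketch of the lookup/insert pair documented here; the helper is hypothetical and not part of the patch:

```c
/*
 * Hypothetical caller: map a file block key to a record, inserting a
 * freshly allocated record when none exists yet. Mirrors how the
 * lookup and insert primitives above are meant to compose.
 */
static int bmap_lookup_or_insert(struct nilfs_bmap *bmap,
				 unsigned long key, unsigned long new_rec,
				 unsigned long *recp)
{
	int ret = nilfs_bmap_lookup(bmap, key, recp);

	if (ret == -ENOENT) {
		ret = nilfs_bmap_insert(bmap, key, new_rec);
		if (!ret)
			*recp = new_rec;
	}
	return ret;
}
```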
+ */ +int nilfs_bmap_insert(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long rec) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_insert(bmap, key, rec); + up_write(&bmap->b_sem); + return ret; +} + +static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) +{ + __u64 keys[NILFS_BMAP_LARGE_LOW + 1]; + __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_delete != NULL) { + ret = bmap->b_ops->bop_check_delete(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1); + if (n < 0) + return n; + ret = nilfs_direct_delete_and_convert( + bmap, key, keys, ptrs, n, + NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH); + if (ret == 0) + bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_delete(bmap, key); +} + +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +{ + __u64 lastkey; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (!ret) + *key = lastkey; + up_read(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_delete - delete a key-record pair from a bmap + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_delete() deletes the key-record pair specified by + * @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_delete(bmap, key); + up_write(&bmap->b_sem); + return ret; +} + +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + __u64 lastkey; + int ret; + + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + + while (key <= lastkey) { + ret = nilfs_bmap_do_delete(bmap, lastkey); + if (ret < 0) + return ret; + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + } + return 0; +} + +/** + * nilfs_bmap_truncate - truncate a bmap to a specified key + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are + * greater than or equal to @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_truncate(bmap, key); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_clear - free resources a bmap holds + * @bmap: bmap + * + * Description: nilfs_bmap_clear() frees resources associated with @bmap. + */ +void nilfs_bmap_clear(struct nilfs_bmap *bmap) +{ + down_write(&bmap->b_sem); + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + up_write(&bmap->b_sem); +} + +/** + * nilfs_bmap_propagate - propagate dirty state + * @bmap: bmap + * @bh: buffer head + * + * Description: nilfs_bmap_propagate() marks the buffers that directly or + * indirectly refer to the block specified by @bh dirty. + * + * Return Value: On success, 0 is returned. 
On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_propagate(bmap, bh); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_lookup_dirty_buffers - + * @bmap: bmap + * @listp: pointer to buffer head list + */ +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + if (bmap->b_ops->bop_lookup_dirty_buffers != NULL) + bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp); +} + +/** + * nilfs_bmap_assign - assign a new block number to a block + * @bmap: bmap + * @bhp: pointer to buffer head + * @blocknr: block number + * @binfo: block information + * + * Description: nilfs_bmap_assign() assigns the block number @blocknr to the + * buffer specified by @bh. + * + * Return Value: On success, 0 is returned and the buffer head of a newly + * created buffer and the block information associated with the buffer are + * stored in the place pointed by @bh and @binfo, respectively. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + unsigned long blocknr, + union nilfs_binfo *binfo) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_mark - mark block dirty + * @bmap: bmap + * @key: key + * @level: level + * + * Description: nilfs_bmap_mark() marks the block specified by @key and @level + * as dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + int ret; + + if (bmap->b_ops->bop_mark == NULL) + return 0; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_mark(bmap, key, level); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state + * @bmap: bmap + * + * Description: nilfs_bmap_test_and_clear_dirty() is the atomic operation to + * test and clear the dirty state of @bmap. + * + * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. 
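To show how the hooks documented above fit together, a speculative sketch of one log-writer step; the ordering follows the comments, but the function itself is invented:

```c
/*
 * Invented helper: first propagate dirtiness from the data block up
 * through the mapping, then bind the block to its final on-disk
 * location once the segment layout has been decided.
 */
static int bmap_write_step(struct nilfs_bmap *bmap,
			   struct buffer_head **bh,
			   unsigned long blocknr,
			   union nilfs_binfo *binfo)
{
	int ret = nilfs_bmap_propagate(bmap, *bh);

	if (ret < 0)
		return ret;
	return nilfs_bmap_assign(bmap, bh, blocknr, binfo);
}
```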
+ */ +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_dirty(bmap); + nilfs_bmap_clear_dirty(bmap); + up_write(&bmap->b_sem); + return ret; +} + + +/* + * Internal use only + */ + +void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n) +{ + inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); + if (NILFS_MDT(bmap->b_inode)) + nilfs_mdt_mark_dirty(bmap->b_inode); + else + mark_inode_dirty(bmap->b_inode); +} + +void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n) +{ + inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); + if (NILFS_MDT(bmap->b_inode)) + nilfs_mdt_mark_dirty(bmap->b_inode); + else + mark_inode_dirty(bmap->b_inode); +} + +int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr, + struct buffer_head **bhp) +{ + return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, + ptr, 0, bhp, 0); +} + +void nilfs_bmap_put_block(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + brelse(bh); +} + +int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr, + struct buffer_head **bhp) +{ + int ret; + + ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, + ptr, 0, bhp, 1); + if (ret < 0) + return ret; + set_buffer_nilfs_volatile(*bhp); + return 0; +} + +void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + nilfs_btnode_delete(bh); +} + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, + const struct buffer_head *bh) +{ + struct buffer_head *pbh; + __u64 key; + + key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - + bmap->b_inode->i_blkbits); + for (pbh = page_buffers(bh->b_page); pbh != bh; + pbh = pbh->b_this_page, key++); + + return key; +} + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) +{ + __s64 diff; + + diff = key - bmap->b_last_allocated_key; + if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) && + (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) && + (bmap->b_last_allocated_ptr + diff > 0)) + return bmap->b_last_allocated_ptr + diff; + else + return NILFS_BMAP_INVALID_PTR; +} + +static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) +{ + return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); +} + +#define NILFS_BMAP_GROUP_DIV 8 +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) +{ + struct inode *dat = nilfs_bmap_get_dat(bmap); + unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat); + unsigned long group = bmap->b_inode->i_ino / entries_per_group; + + return group * entries_per_group + + (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) * + (entries_per_group / NILFS_BMAP_GROUP_DIV); +} + +static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + 
sector_t blocknr) +{ + nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req, + blocknr); +} + +static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0); +} + +static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1); +} + +static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, + sector_t blocknr) +{ + return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); +} + +int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) +{ + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); +} + +int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + int ret; + + ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq); + if (ret < 0) + return ret; + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq); + if (ret < 0) + bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + + return ret; +} + +void nilfs_bmap_commit_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq); + bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq); +} + +void nilfs_bmap_abort_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq); +} + +static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr, + __u64 *ptrp) +{ + sector_t blocknr; + int ret; + + ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr); + if (ret < 0) + return ret; + if (ptrp != NULL) + *ptrp = blocknr; + return 0; +} + +static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + /* ignore target ptr */ + req->bpr_ptr = bmap->b_last_allocated_ptr++; + return 0; +} + +static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + /* do nothing */ +} + +static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + bmap->b_last_allocated_ptr--; +} + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, + .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, + .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, + .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, + .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, + .bpop_commit_end_ptr = nilfs_bmap_commit_end_v, + .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, + + .bpop_translate = nilfs_bmap_translate_v, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = { + 
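/* Identical to nilfs_bmap_ptr_ops_v except for bpop_commit_end_ptr:
+ * nilfs_bmap_commit_end_vmdt() calls nilfs_dat_commit_end() with its
+ * final argument set to 1 (see above), while the plain variant passes 0.
+ */
+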
.bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, + .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, + .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, + .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, + .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, + .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt, + .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, + + .bpop_translate = nilfs_bmap_translate_v, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p, + .bpop_prepare_start_ptr = NULL, + .bpop_commit_start_ptr = NULL, + .bpop_abort_start_ptr = NULL, + .bpop_prepare_end_ptr = NULL, + .bpop_commit_end_ptr = NULL, + .bpop_abort_end_ptr = NULL, + + .bpop_translate = NULL, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = { + .bpop_prepare_alloc_ptr = NULL, + .bpop_commit_alloc_ptr = NULL, + .bpop_abort_alloc_ptr = NULL, + .bpop_prepare_start_ptr = NULL, + .bpop_commit_start_ptr = NULL, + .bpop_abort_start_ptr = NULL, + .bpop_prepare_end_ptr = NULL, + .bpop_commit_end_ptr = NULL, + .bpop_abort_end_ptr = NULL, + + .bpop_translate = NULL, +}; + +static struct lock_class_key nilfs_bmap_dat_lock_key; + +/** + * nilfs_bmap_read - read a bmap from an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_read() initializes the bmap @bmap. + * + * Return Value: On success, 0 is returned. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + if (raw_inode == NULL) + memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE); + else + memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE); + + init_rwsem(&bmap->b_sem); + bmap->b_state = 0; + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + bmap->b_pops = &nilfs_bmap_ptr_ops_p; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); + break; + case NILFS_CPFILE_INO: + case NILFS_SUFILE_INO: + bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + default: + bmap->b_pops = &nilfs_bmap_ptr_ops_v; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + } + + return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? + nilfs_btree_init(bmap, + NILFS_BMAP_LARGE_LOW, + NILFS_BMAP_LARGE_HIGH) : + nilfs_direct_init(bmap, + NILFS_BMAP_SMALL_LOW, + NILFS_BMAP_SMALL_HIGH); +} + +/** + * nilfs_bmap_write - write back a bmap to an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_write() stores @bmap in @raw_inode. 
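+ *
+ * Note: the copy into @raw_inode is made with b_sem held for writing, so
+ * a concurrent bmap update cannot produce a torn image of the bmap data.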
+ */
+void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
+{
+ down_write(&bmap->b_sem);
+ memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
+ NILFS_INODE_BMAP_SIZE * sizeof(__le64));
+ if (bmap->b_inode->i_ino == NILFS_DAT_INO)
+ bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
+
+ up_write(&bmap->b_sem);
+}
+
+void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
+{
+ memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
+ init_rwsem(&bmap->b_sem);
+ bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+ bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
+ bmap->b_last_allocated_key = 0;
+ bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+ bmap->b_state = 0;
+ nilfs_btree_init_gc(bmap);
+}
+
+void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+{
+ memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
+ init_rwsem(&gcbmap->b_sem);
+ lockdep_set_class(&gcbmap->b_sem, &nilfs_bmap_dat_lock_key);
+ gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
+}
+
+void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+{
+ memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
+ init_rwsem(&bmap->b_sem);
+ lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
+ bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
new file mode 100644
index 000000000000..4f2708abb1ba
--- /dev/null
+++ b/fs/nilfs2/bmap.h
@@ -0,0 +1,244 @@
+/*
+ * bmap.h - NILFS block mapping.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_BMAP_H
+#define _NILFS_BMAP_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "alloc.h"
+
+#define NILFS_BMAP_INVALID_PTR 0
+
+#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
+#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
+#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
+#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
+
+#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ?
-(diff) : (diff)) + + +struct nilfs_bmap; + +/** + * union nilfs_bmap_ptr_req - request for bmap ptr + * @bpr_ptr: bmap pointer + * @bpr_req: request for persistent allocator + */ +union nilfs_bmap_ptr_req { + __u64 bpr_ptr; + struct nilfs_palloc_req bpr_req; +}; + +/** + * struct nilfs_bmap_stats - bmap statistics + * @bs_nblocks: number of blocks created or deleted + */ +struct nilfs_bmap_stats { + unsigned int bs_nblocks; +}; + +/** + * struct nilfs_bmap_operations - bmap operation table + */ +struct nilfs_bmap_operations { + int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); + int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); + int (*bop_delete)(struct nilfs_bmap *, __u64); + void (*bop_clear)(struct nilfs_bmap *); + + int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); + void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, + struct list_head *); + + int (*bop_assign)(struct nilfs_bmap *, + struct buffer_head **, + sector_t, + union nilfs_binfo *); + int (*bop_mark)(struct nilfs_bmap *, __u64, int); + + /* The following functions are internal use only. */ + int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); + int (*bop_check_insert)(const struct nilfs_bmap *, __u64); + int (*bop_check_delete)(struct nilfs_bmap *, __u64); + int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); +}; + + +/** + * struct nilfs_bmap_ptr_operations - bmap ptr operation table + */ +struct nilfs_bmap_ptr_operations { + int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + int (*bpop_prepare_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + sector_t); + void (*bpop_abort_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + int (*bpop_prepare_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_abort_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + + int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *); +}; + + +#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) +#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */) +#define NILFS_BMAP_NEW_PTR_INIT \ + (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1)) + +static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) +{ + return !!(ptr & NILFS_BMAP_NEW_PTR_INIT); +} + + +/** + * struct nilfs_bmap - bmap structure + * @b_u: raw data + * @b_sem: semaphore + * @b_inode: owner of bmap + * @b_ops: bmap operation table + * @b_pops: bmap ptr operation table + * @b_low: low watermark of conversion + * @b_high: high watermark of conversion + * @b_last_allocated_key: last allocated key for data block + * @b_last_allocated_ptr: last allocated ptr for data block + * @b_state: state + */ +struct nilfs_bmap { + union { + __u8 u_flags; + __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)]; + } b_u; + struct rw_semaphore b_sem; + struct inode *b_inode; + const struct nilfs_bmap_operations *b_ops; + const struct nilfs_bmap_ptr_operations *b_pops; + __u64 b_low; + __u64 b_high; + __u64 b_last_allocated_key; + __u64 b_last_allocated_ptr; + int b_state; +}; + +/* state */ +#define NILFS_BMAP_DIRTY 0x00000001 + + +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap 
*); +int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); +void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); +int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); +int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); +int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); +int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +void nilfs_bmap_clear(struct nilfs_bmap *); +int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); +int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, + unsigned long, union nilfs_binfo *); +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *); +int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); + +void nilfs_bmap_init_gc(struct nilfs_bmap *); +void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); +void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); + + +/* + * Internal use only + */ + +int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); +int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); + + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, + const struct buffer_head *); + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); + +int nilfs_bmap_prepare_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_commit_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_abort_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); + +void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); +void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); + + +int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64, + struct buffer_head **); +void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *); +int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64, + struct buffer_head **); +void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *); + + +/* Assume that bmap semaphore is locked. */ +static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) +{ + return !!(bmap->b_state & NILFS_BMAP_DIRTY); +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state |= NILFS_BMAP_DIRTY; +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state &= ~NILFS_BMAP_DIRTY; +} + + +#define NILFS_BMAP_LARGE 0x1 + +#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN +#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX +#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX +#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX + +#endif /* _NILFS_BMAP_H */ diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h new file mode 100644 index 000000000000..d41509bff47b --- /dev/null +++ b/fs/nilfs2/bmap_union.h @@ -0,0 +1,42 @@ +/* + * bmap_union.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_BMAP_UNION_H
+#define _NILFS_BMAP_UNION_H
+
+#include "bmap.h"
+#include "direct.h"
+#include "btree.h"
+
+/**
+ * nilfs_bmap_union - union of the on-memory bmap representations
+ * @bi_bmap: bmap structure
+ * @bi_direct: direct map structure
+ * @bi_btree: B-tree structure
+ */
+union nilfs_bmap_union {
+ struct nilfs_bmap bi_bmap;
+ struct nilfs_direct bi_direct;
+ struct nilfs_btree bi_btree;
+};
+
+#endif /* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
new file mode 100644
index 000000000000..4cc07b2c30e0
--- /dev/null
+++ b/fs/nilfs2/btnode.c
@@ -0,0 +1,316 @@
+/*
+ * btnode.c - NILFS B-tree node cache
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * This file was originally written by Seiji Kihara <kihara@osrg.net>
+ * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
+ * stabilization and simplification.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "dat.h"
+#include "page.h"
+#include "btnode.h"
+
+
+void nilfs_btnode_cache_init_once(struct address_space *btnc)
+{
+ INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
+ spin_lock_init(&btnc->tree_lock);
+ INIT_LIST_HEAD(&btnc->private_list);
+ spin_lock_init(&btnc->private_lock);
+
+ spin_lock_init(&btnc->i_mmap_lock);
+ INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
+ INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
+}
+
+static struct address_space_operations def_btnode_aops;
+
+void nilfs_btnode_cache_init(struct address_space *btnc)
+{
+ btnc->host = NULL; /* can safely set to host inode ?
*/
+ btnc->flags = 0;
+ mapping_set_gfp_mask(btnc, GFP_NOFS);
+ btnc->assoc_mapping = NULL;
+ btnc->backing_dev_info = &default_backing_dev_info;
+ btnc->a_ops = &def_btnode_aops;
+}
+
+void nilfs_btnode_cache_clear(struct address_space *btnc)
+{
+ invalidate_mapping_pages(btnc, 0, -1);
+ truncate_inode_pages(btnc, 0);
+}
+
+int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
+ sector_t pblocknr, struct buffer_head **pbh,
+ int newblk)
+{
+ struct buffer_head *bh;
+ struct inode *inode = NILFS_BTNC_I(btnc);
+ int err;
+
+ bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+ if (unlikely(!bh))
+ return -ENOMEM;
+
+ err = -EEXIST; /* internal code */
+ if (newblk) {
+ if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+ buffer_dirty(bh))) {
+ brelse(bh);
+ BUG();
+ }
+ bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+ bh->b_blocknr = blocknr;
+ set_buffer_mapped(bh);
+ set_buffer_uptodate(bh);
+ goto found;
+ }
+
+ if (buffer_uptodate(bh) || buffer_dirty(bh))
+ goto found;
+
+ if (pblocknr == 0) {
+ pblocknr = blocknr;
+ if (inode->i_ino != NILFS_DAT_INO) {
+ struct inode *dat =
+ nilfs_dat_inode(NILFS_I_NILFS(inode));
+
+ /* blocknr is a virtual block number */
+ err = nilfs_dat_translate(dat, blocknr, &pblocknr);
+ if (unlikely(err)) {
+ brelse(bh);
+ goto out_locked;
+ }
+ }
+ }
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ err = -EEXIST; /* internal code */
+ goto found;
+ }
+ set_buffer_mapped(bh);
+ bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+ bh->b_blocknr = pblocknr; /* set block address for read */
+ bh->b_end_io = end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(READ, bh);
+ bh->b_blocknr = blocknr; /* set back to the given block address */
+ err = 0;
+found:
+ *pbh = bh;
+
+out_locked:
+ unlock_page(bh->b_page);
+ page_cache_release(bh->b_page);
+ return err;
+}
+
+int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
+ sector_t pblocknr, struct buffer_head **pbh, int newblk)
+{
+ struct buffer_head *bh;
+ int err;
+
+ err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
+ if (err == -EEXIST) /* internal code (cache hit) */
+ return 0;
+ if (unlikely(err))
+ return err;
+
+ bh = *pbh;
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ brelse(bh);
+ return -EIO;
+ }
+ return 0;
+}
+
+/**
+ * nilfs_btnode_delete - delete B-tree node buffer
+ * @bh: buffer to be deleted
+ *
+ * nilfs_btnode_delete() invalidates the specified buffer and deletes the page
+ * including the buffer if the page becomes unbusy.
+ */
+void nilfs_btnode_delete(struct buffer_head *bh)
+{
+ struct address_space *mapping;
+ struct page *page = bh->b_page;
+ pgoff_t index = page_index(page);
+ int still_dirty;
+
+ page_cache_get(page);
+ lock_page(page);
+ wait_on_page_writeback(page);
+
+ nilfs_forget_buffer(bh);
+ still_dirty = PageDirty(page);
+ mapping = page->mapping;
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (!still_dirty && mapping)
+ invalidate_inode_pages2_range(mapping, index, index);
+}
+
+/**
+ * nilfs_btnode_prepare_change_key
+ * prepare to move the contents of the block at the old key to the block
+ * at the new key. The old buffer will not be removed, but might be
+ * reused for the new buffer. This function might return -ENOMEM because
+ * of memory allocation errors, and might return -EIO because of disk
+ * read errors.
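+ *
+ * Two strategies are used here: if the block size equals the page size,
+ * the whole page is moved to the new key by inserting it at the new index
+ * of the btnode radix tree; otherwise the contents are copied into a
+ * buffer newly allocated at the new key (copy mode), and the old buffer
+ * is deleted when the operation is committed.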
+ */
+int nilfs_btnode_prepare_change_key(struct address_space *btnc,
+ struct nilfs_btnode_chkey_ctxt *ctxt)
+{
+ struct buffer_head *obh, *nbh;
+ struct inode *inode = NILFS_BTNC_I(btnc);
+ __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
+ int err;
+
+ if (oldkey == newkey)
+ return 0;
+
+ obh = ctxt->bh;
+ ctxt->newbh = NULL;
+
+ if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
+ lock_page(obh->b_page);
+ /*
+ * We cannot call radix_tree_preload for the kernels older
+ * than 2.6.23, because it is not exported for modules.
+ */
+ err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (err)
+ goto failed_unlock;
+ /* BUG_ON(oldkey != obh->b_page->index); */
+ if (unlikely(oldkey != obh->b_page->index))
+ NILFS_PAGE_BUG(obh->b_page,
+ "invalid oldkey %lld (newkey=%lld)",
+ (unsigned long long)oldkey,
+ (unsigned long long)newkey);
+
+retry:
+ spin_lock_irq(&btnc->tree_lock);
+ err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
+ spin_unlock_irq(&btnc->tree_lock);
+ /*
+ * Note: page->index will not change to newkey until
+ * nilfs_btnode_commit_change_key() is called.
+ * To protect the page in this intermediate state, the page
+ * lock is held.
+ */
+ radix_tree_preload_end();
+ if (!err)
+ return 0;
+ else if (err != -EEXIST)
+ goto failed_unlock;
+
+ err = invalidate_inode_pages2_range(btnc, newkey, newkey);
+ if (!err)
+ goto retry;
+ /* fallback to copy mode */
+ unlock_page(obh->b_page);
+ }
+
+ err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
+ if (likely(!err)) {
+ BUG_ON(nbh == obh);
+ ctxt->newbh = nbh;
+ }
+ return err;
+
+ failed_unlock:
+ unlock_page(obh->b_page);
+ return err;
+}
+
+/**
+ * nilfs_btnode_commit_change_key
+ * commit the change_key operation prepared by prepare_change_key().
+ */
+void nilfs_btnode_commit_change_key(struct address_space *btnc,
+ struct nilfs_btnode_chkey_ctxt *ctxt)
+{
+ struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
+ __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
+ struct page *opage;
+
+ if (oldkey == newkey)
+ return;
+
+ if (nbh == NULL) { /* blocksize == pagesize */
+ opage = obh->b_page;
+ if (unlikely(oldkey != opage->index))
+ NILFS_PAGE_BUG(opage,
+ "invalid oldkey %lld (newkey=%lld)",
+ (unsigned long long)oldkey,
+ (unsigned long long)newkey);
+ if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
+ BUG();
+
+ spin_lock_irq(&btnc->tree_lock);
+ radix_tree_delete(&btnc->page_tree, oldkey);
+ radix_tree_tag_set(&btnc->page_tree, newkey,
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irq(&btnc->tree_lock);
+
+ opage->index = obh->b_blocknr = newkey;
+ unlock_page(opage);
+ } else {
+ nilfs_copy_buffer(nbh, obh);
+ nilfs_btnode_mark_dirty(nbh);
+
+ nbh->b_blocknr = newkey;
+ ctxt->bh = nbh;
+ nilfs_btnode_delete(obh); /* will decrement bh->b_count */
+ }
+}
+
+/**
+ * nilfs_btnode_abort_change_key
+ * abort the change_key operation prepared by prepare_change_key().
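+ * In page-move mode this removes the radix tree entry inserted at the
+ * new key and drops the page lock taken in the prepare phase; in copy
+ * mode the newly allocated buffer is simply released.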
+ */ +void nilfs_btnode_abort_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, newkey); + spin_unlock_irq(&btnc->tree_lock); + unlock_page(ctxt->bh->b_page); + } else + brelse(nbh); +} diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h new file mode 100644 index 000000000000..35faa86444a7 --- /dev/null +++ b/fs/nilfs2/btnode.h @@ -0,0 +1,58 @@ +/* + * btnode.h - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net> + * Revised by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_BTNODE_H +#define _NILFS_BTNODE_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/backing-dev.h> + + +struct nilfs_btnode_chkey_ctxt { + __u64 oldkey; + __u64 newkey; + struct buffer_head *bh; + struct buffer_head *newbh; +}; + +void nilfs_btnode_cache_init_once(struct address_space *); +void nilfs_btnode_cache_init(struct address_space *); +void nilfs_btnode_cache_clear(struct address_space *); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, + struct buffer_head **, int); +int nilfs_btnode_get(struct address_space *, __u64, sector_t, + struct buffer_head **, int); +void nilfs_btnode_delete(struct buffer_head *); +int nilfs_btnode_prepare_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_commit_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_abort_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); + +#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh) + + +#endif /* _NILFS_BTNODE_H */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c new file mode 100644 index 000000000000..6b37a2767293 --- /dev/null +++ b/fs/nilfs2/btree.c @@ -0,0 +1,2269 @@ +/* + * btree.c - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/pagevec.h>
+#include "nilfs.h"
+#include "page.h"
+#include "btnode.h"
+#include "btree.h"
+#include "alloc.h"
+
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_ctxt: change-key context for the B-tree node block
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+ struct buffer_head *bp_bh;
+ struct buffer_head *bp_sib_bh;
+ int bp_index;
+ union nilfs_bmap_ptr_req bp_oldreq;
+ union nilfs_bmap_ptr_req bp_newreq;
+ struct nilfs_btnode_chkey_ctxt bp_ctxt;
+ void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+ int, __u64 *, __u64 *);
+};
+
+/*
+ * B-tree path operations
+ */
+
+static struct kmem_cache *nilfs_btree_path_cache;
+
+int __init nilfs_btree_path_cache_init(void)
+{
+ nilfs_btree_path_cache =
+ kmem_cache_create("nilfs2_btree_path_cache",
+ sizeof(struct nilfs_btree_path) *
+ NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
+ return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
+}
+
+void nilfs_btree_path_cache_destroy(void)
+{
+ kmem_cache_destroy(nilfs_btree_path_cache);
+}
+
+static inline struct nilfs_btree_path *
+nilfs_btree_alloc_path(const struct nilfs_btree *btree)
+{
+ return (struct nilfs_btree_path *)
+ kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+}
+
+static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
+ struct nilfs_btree_path *path)
+{
+ kmem_cache_free(nilfs_btree_path_cache, path);
+}
+
+static void nilfs_btree_init_path(const struct nilfs_btree *btree,
+ struct nilfs_btree_path *path)
+{
+ int level;
+
+ for (level = NILFS_BTREE_LEVEL_DATA;
+ level < NILFS_BTREE_LEVEL_MAX;
+ level++) {
+ path[level].bp_bh = NULL;
+ path[level].bp_sib_bh = NULL;
+ path[level].bp_index = 0;
+ path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
+ path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
+ path[level].bp_op = NULL;
+ }
+}
+
+static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
+ struct nilfs_btree_path *path)
+{
+ int level;
+
+ for (level = NILFS_BTREE_LEVEL_DATA;
+ level < NILFS_BTREE_LEVEL_MAX;
+ level++) {
+ if (path[level].bp_bh != NULL) {
+ nilfs_bmap_put_block(&btree->bt_bmap,
+ path[level].bp_bh);
+ path[level].bp_bh = NULL;
+ }
+ /* sib_bh is released or deleted by prepare or commit
+ * operations.
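+ * Only the per-level pointers are reset here; the path itself is a
+ * single kmem_cache object sized for NILFS_BTREE_LEVEL_MAX levels and
+ * stays allocated until nilfs_btree_free_path().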
*/ + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } +} + + +/* + * B-tree node operations + */ + +static inline int +nilfs_btree_node_get_flags(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return node->bn_flags; +} + +static inline void +nilfs_btree_node_set_flags(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int flags) +{ + node->bn_flags = flags; +} + +static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; +} + +static inline int +nilfs_btree_node_get_level(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return node->bn_level; +} + +static inline void +nilfs_btree_node_set_level(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int level) +{ + node->bn_level = level; +} + +static inline int +nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return le16_to_cpu(node->bn_nchildren); +} + +static inline void +nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int nchildren) +{ + node->bn_nchildren = cpu_to_le16(nchildren); +} + +static inline int +nilfs_btree_node_size(const struct nilfs_btree *btree) +{ + return 1 << btree->bt_bmap.b_inode->i_blkbits; +} + +static inline int +nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_root(btree, node) ? + NILFS_BTREE_ROOT_NCHILDREN_MIN : + NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); +} + +static inline int +nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_root(btree, node) ? + NILFS_BTREE_ROOT_NCHILDREN_MAX : + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); +} + +static inline __le64 * +nilfs_btree_node_dkeys(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return (__le64 *)((char *)(node + 1) + + (nilfs_btree_node_root(btree, node) ? 
+ 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); +} + +static inline __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + + nilfs_btree_node_nchildren_max(btree, node)); +} + +static inline __u64 +nilfs_btree_node_get_key(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, int index) +{ + return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + + index)); +} + +static inline void +nilfs_btree_node_set_key(struct nilfs_btree *btree, + struct nilfs_btree_node *node, int index, __u64 key) +{ + *(nilfs_btree_node_dkeys(btree, node) + index) = + nilfs_bmap_key_to_dkey(key); +} + +static inline __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, + int index) +{ + return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + + index)); +} + +static inline void +nilfs_btree_node_set_ptr(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int index, + __u64 ptr) +{ + *(nilfs_btree_node_dptrs(btree, node) + index) = + nilfs_bmap_ptr_to_dptr(ptr); +} + +static void nilfs_btree_node_init(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int flags, int level, int nchildren, + const __u64 *keys, const __u64 *ptrs) +{ + __le64 *dkeys; + __le64 *dptrs; + int i; + + nilfs_btree_node_set_flags(btree, node, flags); + nilfs_btree_node_set_level(btree, node, level); + nilfs_btree_node_set_nchildren(btree, node, nchildren); + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + for (i = 0; i < nchildren; i++) { + dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); + dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); + } +} + +/* Assume the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_left(struct nilfs_btree *btree, + struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(btree, left); + ldptrs = nilfs_btree_node_dptrs(btree, left); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + rdkeys = nilfs_btree_node_dkeys(btree, right); + rdptrs = nilfs_btree_node_dptrs(btree, right); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); + memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); + memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys)); + memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs)); + + lnchildren += n; + rnchildren -= n; + nilfs_btree_node_set_nchildren(btree, left, lnchildren); + nilfs_btree_node_set_nchildren(btree, right, rnchildren); +} + +/* Assume that the buffer heads corresponding to left and right are locked. 
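+ * Moving n children shifts the right node's entries up by n and copies
+ * the last n key/pointer pairs of the left node into the vacated slots;
+ * both child counts are updated accordingly.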
*/ +static void nilfs_btree_node_move_right(struct nilfs_btree *btree, + struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(btree, left); + ldptrs = nilfs_btree_node_dptrs(btree, left); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + rdkeys = nilfs_btree_node_dkeys(btree, right); + rdptrs = nilfs_btree_node_dptrs(btree, right); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); + memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); + memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys)); + memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs)); + + lnchildren -= n; + rnchildren += n; + nilfs_btree_node_set_nchildren(btree, left, lnchildren); + nilfs_btree_node_set_nchildren(btree, right, rnchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_insert(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + __u64 key, __u64 ptr, int index) +{ + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (index < nchildren) { + memmove(dkeys + index + 1, dkeys + index, + (nchildren - index) * sizeof(*dkeys)); + memmove(dptrs + index + 1, dptrs + index, + (nchildren - index) * sizeof(*dptrs)); + } + dkeys[index] = nilfs_bmap_key_to_dkey(key); + dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); + nchildren++; + nilfs_btree_node_set_nchildren(btree, node, nchildren); +} + +/* Assume that the buffer head corresponding to node is locked. 
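+ * When requested, the deleted key and pointer are returned through *keyp
+ * and *ptrp before the remaining entries are compacted.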
*/ +static void nilfs_btree_node_delete(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + __u64 *keyp, __u64 *ptrp, int index) +{ + __u64 key; + __u64 ptr; + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + key = nilfs_bmap_dkey_to_key(dkeys[index]); + ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (keyp != NULL) + *keyp = key; + if (ptrp != NULL) + *ptrp = ptr; + + if (index < nchildren - 1) { + memmove(dkeys + index, dkeys + index + 1, + (nchildren - index - 1) * sizeof(*dkeys)); + memmove(dptrs + index, dptrs + index + 1, + (nchildren - index - 1) * sizeof(*dptrs)); + } + nchildren--; + nilfs_btree_node_set_nchildren(btree, node, nchildren); +} + +static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, + __u64 key, int *indexp) +{ + __u64 nkey; + int index, low, high, s; + + /* binary search */ + low = 0; + high = nilfs_btree_node_get_nchildren(btree, node) - 1; + index = 0; + s = 0; + while (low <= high) { + index = (low + high) / 2; + nkey = nilfs_btree_node_get_key(btree, node, index); + if (nkey == key) { + s = 0; + goto out; + } else if (nkey < key) { + low = index + 1; + s = -1; + } else { + high = index - 1; + s = 1; + } + } + + /* adjust index */ + if (nilfs_btree_node_get_level(btree, node) > + NILFS_BTREE_LEVEL_NODE_MIN) { + if ((s > 0) && (index > 0)) + index--; + } else if (s < 0) + index++; + + out: + *indexp = index; + + return s == 0; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_btree *btree) +{ + return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (struct nilfs_btree_node *)path[level].bp_bh->b_data; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_sib_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; +} + +static inline int nilfs_btree_height(const struct nilfs_btree *btree) +{ + return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) + + 1; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (level == nilfs_btree_height(btree) - 1) ? 
+ nilfs_btree_get_root(btree) : + nilfs_btree_get_nonroot_node(btree, path, level); +} + +static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, + struct nilfs_btree_path *path, + __u64 key, __u64 *ptrp, int minlevel) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int level, index, found, ret; + + node = nilfs_btree_get_root(btree); + level = nilfs_btree_node_get_level(btree, node); + if ((level < minlevel) || + (nilfs_btree_node_get_nchildren(btree, node) <= 0)) + return -ENOENT; + + found = nilfs_btree_node_lookup(btree, node, key, &index); + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + for (level--; level >= minlevel; level--) { + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, + &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(btree, path, level); + BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + if (!found) + found = nilfs_btree_node_lookup(btree, node, key, + &index); + else + index = 0; + if (index < nilfs_btree_node_nchildren_max(btree, node)) + ptr = nilfs_btree_node_get_ptr(btree, node, index); + else { + WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); + /* insert */ + ptr = NILFS_BMAP_INVALID_PTR; + } + path[level].bp_index = index; + } + if (!found) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, + struct nilfs_btree_path *path, + __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int index, level, ret; + + node = nilfs_btree_get_root(btree); + index = nilfs_btree_node_get_nchildren(btree, node) - 1; + if (index < 0) + return -ENOENT; + level = nilfs_btree_node_get_level(btree, node); + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + for (level--; level > 0; level--) { + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, + &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(btree, path, level); + BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + index = nilfs_btree_node_get_nchildren(btree, node) - 1; + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_index = index; + } + + if (keyp != NULL) + *keyp = nilfs_btree_node_get_key(btree, node, index); + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); + + if (ptrp != NULL) + *ptrp = ptr; + + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static void nilfs_btree_promote_key(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 key) +{ + if (level < nilfs_btree_height(btree) - 1) { + do { + lock_buffer(path[level].bp_bh); + nilfs_btree_node_set_key( + btree, + nilfs_btree_get_nonroot_node( + btree, path, level), + path[level].bp_index, key); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + } while ((path[level].bp_index == 0) && + (++level < nilfs_btree_height(btree) - 1)); + } + + /* root 
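level: the root node is embedded in the bmap structure itself, so no
+ * buffer locking or dirty marking is needed here.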
*/ + if (level == nilfs_btree_height(btree) - 1) { + nilfs_btree_node_set_key(btree, + nilfs_btree_get_root(btree), + path[level].bp_index, key); + } +} + +static void nilfs_btree_do_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + + if (level < nilfs_btree_height(btree) - 1) { + lock_buffer(path[level].bp_bh); + node = nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_node_insert(btree, node, *keyp, *ptrp, + path[level].bp_index); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key( + btree, node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_insert(btree, node, *keyp, *ptrp, + path[level].bp_index); + } +} + +static void nilfs_btree_carry_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + move = 0; + + n = (nchildren + lnchildren + 1) / 2 - lnchildren; + if (n > path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_left(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + + if (move) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += lnchildren; + path[level + 1].bp_index--; + } else { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index -= n; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_carry_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + move = 0; + + n = (nchildren + rnchildren + 1) / 2 - rnchildren; + if (n > nchildren - path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_right(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + 
nilfs_btree_node_get_key(btree, right, 0)); + path[level + 1].bp_index--; + + if (move) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index -= + nilfs_btree_node_get_nchildren(btree, node); + path[level + 1].bp_index++; + } else { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_split(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + __u64 newkey; + __u64 newptr; + int nchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + move = 0; + + n = (nchildren + 1) / 2; + if (n > nchildren - path[level].bp_index) { + n--; + move = 1; + } + + nilfs_btree_node_move_right(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + newkey = nilfs_btree_node_get_key(btree, right, 0); + newptr = path[level].bp_newreq.bpr_ptr; + + if (move) { + path[level].bp_index -= + nilfs_btree_node_get_nchildren(btree, node); + nilfs_btree_node_insert(btree, right, *keyp, *ptrp, + path[level].bp_index); + + *keyp = nilfs_btree_node_get_key(btree, right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + } else { + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(btree, right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + path[level + 1].bp_index++; +} + +static void nilfs_btree_grow(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n; + + lock_buffer(path[level].bp_sib_bh); + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, root); + + nilfs_btree_node_move_right(btree, root, child, n); + nilfs_btree_node_set_level(btree, root, level + 1); + + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_sib_bh); + + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(btree, child, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; +} + +static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path) +{ + struct nilfs_btree_node *node; + int level; + + if (path == NULL) + return NILFS_BMAP_INVALID_PTR; + + /* left sibling */ + level = NILFS_BTREE_LEVEL_NODE_MIN; + if (path[level].bp_index > 0) { + node = nilfs_btree_get_node(btree, path, level); + return nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index - 1); + } + + /* parent */ + level = 
NILFS_BTREE_LEVEL_NODE_MIN + 1; + if (level <= nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_node(btree, path, level); + return nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index); + } + + return NILFS_BMAP_INVALID_PTR; +} + +static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else { + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + } + /* block group */ + return nilfs_bmap_find_target_in_group(&btree->bt_bmap); +} + +static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key, + __u64 ptr) +{ + btree->bt_bmap.b_last_allocated_key = key; + btree->bt_bmap.b_last_allocated_ptr = ptr; +} + +static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int *levelp, __u64 key, __u64 ptr, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ret; + + stats->bs_nblocks = 0; + level = NILFS_BTREE_LEVEL_DATA; + + /* allocate a new ptr for data block */ + if (btree->bt_ops->btop_find_target != NULL) + path[level].bp_newreq.bpr_ptr = + btree->bt_ops->btop_find_target(btree, path, key); + + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_data; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(btree, path, level); + if (nilfs_btree_node_get_nchildren(btree, node) < + nilfs_btree_node_nchildren_max(btree, node)) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1); + pindex = path[level + 1].bp_index; + + /* left sibling */ + if (pindex > 0) { + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex - 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) < + nilfs_btree_node_nchildren_max(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_left; + stats->bs_nblocks++; + goto out; + } else + nilfs_bmap_put_block(&btree->bt_bmap, bh); + } + + /* right sibling */ + if (pindex < + nilfs_btree_node_get_nchildren(btree, parent) - 1) { + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex + 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) < + nilfs_btree_node_nchildren_max(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_right; + stats->bs_nblocks++; + goto out; + } else + nilfs_bmap_put_block(&btree->bt_bmap, bh); + } + + /* split */ + path[level].bp_newreq.bpr_ptr = + path[level - 1].bp_newreq.bpr_ptr + 1; + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_bmap_get_new_block(&btree->bt_bmap, + path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + stats->bs_nblocks++; + + lock_buffer(bh); + 
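/* set up the freshly allocated block as an empty non-root node of this
+ * level; the split operation installed in bp_op will fill it in */
+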
nilfs_btree_node_init(btree, + (struct nilfs_btree_node *)bh->b_data, + 0, level, 0, NULL, NULL); + unlock_buffer(bh); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_split; + } + + /* root */ + node = nilfs_btree_get_root(btree); + if (nilfs_btree_node_get_nchildren(btree, node) < + nilfs_btree_node_nchildren_max(btree, node)) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + /* grow */ + path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_bmap_get_new_block(&btree->bt_bmap, + path[level].bp_newreq.bpr_ptr, &bh); + if (ret < 0) + goto err_out_curr_node; + + lock_buffer(bh); + nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, + 0, level, 0, NULL, NULL); + unlock_buffer(bh); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_grow; + + level++; + path[level].bp_op = nilfs_btree_do_insert; + + /* a newly-created node block and a data block are added */ + stats->bs_nblocks += 2; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); + err_out_child_node: + for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + + } + + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); + err_out_data: + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int maxlevel, __u64 key, __u64 ptr) +{ + int level; + + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; + if (btree->bt_ops->btop_set_target != NULL) + btree->bt_ops->btop_set_target(btree, key, ptr); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) { + btree->bt_bmap.b_pops->bpop_commit_alloc_ptr( + &btree->bt_bmap, &path[level - 1].bp_newreq); + } + path[level].bp_op(btree, path, level, &key, &ptr); + } + + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); +} + +static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN); + if (ret != -ENOENT) { + if (ret == 0) + ret = -EEXIST; + goto out; + } + + ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_insert(btree, path, level, key, ptr); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static void nilfs_btree_do_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + + if (level < nilfs_btree_height(btree) 
- 1) { + lock_buffer(path[level].bp_bh); + node = nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_node_delete(btree, node, keyp, ptrp, + path[level].bp_index); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_delete(btree, node, keyp, ptrp, + path[level].bp_index); + } +} + +static void nilfs_btree_borrow_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + n = (nchildren + lnchildren) / 2 - nchildren; + + nilfs_btree_node_move_right(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index += n; +} + +static void nilfs_btree_borrow_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + n = (nchildren + rnchildren) / 2 - nchildren; + + nilfs_btree_node_move_left(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, right, 0)); + path[level + 1].bp_index--; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; +} + +static void nilfs_btree_concat_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, node); + + nilfs_btree_node_move_left(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_sib_bh)) + 
nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); +} + +static void nilfs_btree_concat_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, right); + + nilfs_btree_node_move_left(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level + 1].bp_index++; +} + +static void nilfs_btree_shrink(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_nonroot_node(btree, path, level); + + nilfs_btree_node_delete(btree, root, NULL, NULL, 0); + nilfs_btree_node_set_level(btree, root, level); + n = nilfs_btree_node_get_nchildren(btree, child); + nilfs_btree_node_move_left(btree, root, child, n); + unlock_buffer(path[level].bp_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = NULL; +} + + +static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int *levelp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ret; + + ret = 0; + stats->bs_nblocks = 0; + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(btree, path, level); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index); + if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + } + + if (nilfs_btree_node_get_nchildren(btree, node) > + nilfs_btree_node_nchildren_min(btree, node)) { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1); + pindex = path[level + 1].bp_index; + + if (pindex > 0) { + /* left sibling */ + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex - 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) > + nilfs_btree_node_nchildren_min(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_left; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_left; + stats->bs_nblocks++; + /* 
continue; */ + } + } else if (pindex < + nilfs_btree_node_get_nchildren(btree, parent) - 1) { + /* right sibling */ + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex + 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) > + nilfs_btree_node_nchildren_min(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_right; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_right; + stats->bs_nblocks++; + /* continue; */ + } + } else { + /* no siblings */ + /* the only child of the root node */ + WARN_ON(level != nilfs_btree_height(btree) - 2); + if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_shrink; + stats->bs_nblocks += 2; + } else { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + } + + goto out; + + } + } + + node = nilfs_btree_get_root(btree); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); + if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + } + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_abort_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + err_out_child_node: + for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_abort_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + } + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int maxlevel) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_commit_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + path[level].bp_op(btree, path, level, NULL, NULL); + } + + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); +} + +static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) + +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN); + if (ret < 0) + goto out; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_delete(btree, path, level); + nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + +out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + int ret; + + btree = (struct 
nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); + + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_node *root, *node; + __u64 maxkey, nextmaxkey; + __u64 ptr; + int nchildren, ret; + + btree = (struct nilfs_btree *)bmap; + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(btree, root); + if (nchildren > 1) + return 0; + ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ret = nilfs_bmap_get_block(bmap, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + return 0; + } + + nchildren = nilfs_btree_node_get_nchildren(btree, node); + maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); + nextmaxkey = (nchildren > 1) ? + nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; + if (bh != NULL) + nilfs_bmap_put_block(bmap, bh); + + return (maxkey == key) && (nextmaxkey < bmap->b_low); +} + +static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_node *node, *root; + __le64 *dkeys; + __le64 *dptrs; + __u64 ptr; + int nchildren, i, ret; + + btree = (struct nilfs_btree *)bmap; + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(btree, root); + WARN_ON(nchildren > 1); + ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ret = nilfs_bmap_get_block(bmap, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + node = NULL; + return -EINVAL; + } + + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (nchildren < nitems) + nitems = nchildren; + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + for (i = 0; i < nitems; i++) { + keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); + ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); + } + + if (bh != NULL) + nilfs_bmap_put_block(bmap, bh); + + return nitems; +} + +static int +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head **bhp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + int ret; + + btree = (struct nilfs_btree *)bmap; + stats->bs_nblocks = 0; + + /* for data */ + /* cannot find near ptr */ + if (btree->bt_ops->btop_find_target != NULL) + dreq->bpr_ptr + = btree->bt_ops->btop_find_target(btree, NULL, key); + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq); + if (ret < 0) + return ret; + + *bhp = NULL; + stats->bs_nblocks++; + if (nreq != NULL) { + nreq->bpr_ptr = dreq->bpr_ptr + 1; + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq); + if (ret < 0) + goto err_out_dreq; + + ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh); + if (ret < 0) + goto err_out_nreq; + + *bhp = bh; + stats->bs_nblocks++; + } + + /* success */ + return 0; + + /* error */ + err_out_nreq: + 
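/* undo in reverse order of preparation: abort the new node allocation here, then fall through and abort the data-block allocation */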
bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq); + err_out_dreq: + bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq); + stats->bs_nblocks = 0; + return ret; + +} + +static void +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, __u64 low, __u64 high, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head *bh) +{ + struct nilfs_btree *btree; + struct nilfs_btree_node *node; + __u64 tmpptr; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* ptr must be a pointer to a buffer head. */ + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + + /* convert and insert */ + btree = (struct nilfs_btree *)bmap; + nilfs_btree_init(bmap, low, high); + if (nreq != NULL) { + if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) { + bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq); + } + + /* create child node at level 1 */ + lock_buffer(bh); + node = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); + nilfs_btree_node_insert(btree, node, + key, dreq->bpr_ptr, n); + if (!buffer_dirty(bh)) + nilfs_btnode_mark_dirty(bh); + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + + unlock_buffer(bh); + nilfs_bmap_put_block(bmap, bh); + + /* create root node at level 2 */ + node = nilfs_btree_get_root(btree); + tmpptr = nreq->bpr_ptr; + nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, + 2, 1, &keys[0], &tmpptr); + } else { + if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) + bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + + /* create root node at level 1 */ + node = nilfs_btree_get_root(btree); + nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, + 1, n, keys, ptrs); + nilfs_btree_node_insert(btree, node, + key, dreq->bpr_ptr, n); + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + } + + if (btree->bt_ops->btop_set_target != NULL) + btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr); +} + +/** + * nilfs_btree_convert_and_insert - + * @bmap: + * @key: + * @ptr: + * @keys: + * @ptrs: + * @n: + * @low: + * @high: + */ +int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, __u64 low, __u64 high) +{ + struct buffer_head *bh; + union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; + struct nilfs_bmap_stats stats; + int ret; + + if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { + di = &dreq; + ni = NULL; + } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( + 1 << bmap->b_inode->i_blkbits)) { + di = &dreq; + ni = &nreq; + } else { + di = NULL; + ni = NULL; + BUG(); + } + + ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, + &stats); + if (ret < 0) + return ret; + nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, + low, high, di, ni, bh); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + return 0; +} + +static int nilfs_btree_propagate_p(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + + return 0; +} + +static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + struct nilfs_btree_node *parent; + int ret; + + parent = nilfs_btree_get_node(btree, path, 
level + 1); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(path[level].bp_bh)) { + path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr; + path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; + path[level].bp_ctxt.bh = path[level].bp_bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) { + nilfs_bmap_abort_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + return ret; + } + } + + return 0; +} + +static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + struct nilfs_btree_node *parent; + + nilfs_bmap_commit_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + + if (buffer_nilfs_node(path[level].bp_bh)) { + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + path[level].bp_bh = path[level].bp_ctxt.bh; + } + set_buffer_nilfs_volatile(path[level].bp_bh); + + parent = nilfs_btree_get_node(btree, path, level + 1); + nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr); +} + +static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + nilfs_bmap_abort_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + if (buffer_nilfs_node(path[level].bp_bh)) + nilfs_btnode_abort_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); +} + +static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int minlevel, + int *maxlevelp) +{ + int level, ret; + + level = minlevel; + if (!buffer_nilfs_volatile(path[level].bp_bh)) { + ret = nilfs_btree_prepare_update_v(btree, path, level); + if (ret < 0) + return ret; + } + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) { + + WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); + ret = nilfs_btree_prepare_update_v(btree, path, level); + if (ret < 0) + goto out; + } + + /* success */ + *maxlevelp = level - 1; + return 0; + + /* error */ + out: + while (--level > minlevel) + nilfs_btree_abort_update_v(btree, path, level); + if (!buffer_nilfs_volatile(path[level].bp_bh)) + nilfs_btree_abort_update_v(btree, path, level); + return ret; +} + +static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int minlevel, + int maxlevel, + struct buffer_head *bh) +{ + int level; + + if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) + nilfs_btree_commit_update_v(btree, path, minlevel); + + for (level = minlevel + 1; level <= maxlevel; level++) + nilfs_btree_commit_update_v(btree, path, level); +} + +static int nilfs_btree_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + int maxlevel, ret; + struct nilfs_btree_node *parent; + __u64 ptr; + + get_bh(bh); + path[level].bp_bh = bh; + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); + if (ret < 0) + goto out; + + if (buffer_nilfs_volatile(path[level].bp_bh)) { + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = 
nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); + if (ret < 0) + goto out; + } + + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); + + out: + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + return ret; +} + +static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + WARN_ON(!buffer_dirty(bh)); + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + if (buffer_nilfs_node(bh)) { + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + } else { + key = nilfs_bmap_data_get_key(bmap, bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + if (ret < 0) { + if (unlikely(ret == -ENOENT)) + printk(KERN_CRIT "%s: key = %llu, level == %d\n", + __func__, (unsigned long long)key, level); + goto out; + } + + ret = btree->bt_ops->btop_propagate(btree, path, level, bh); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); +} + +static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, + struct list_head *lists, + struct buffer_head *bh) +{ + struct list_head *head; + struct buffer_head *cbh; + struct nilfs_btree_node *node, *cnode; + __u64 key, ckey; + int level; + + get_bh(bh); + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + list_for_each(head, &lists[level]) { + cbh = list_entry(head, struct buffer_head, b_assoc_buffers); + cnode = (struct nilfs_btree_node *)cbh->b_data; + ckey = nilfs_btree_node_get_key(btree, cnode, 0); + if (key < ckey) + break; + } + list_add_tail(&bh->b_assoc_buffers, head); +} + +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache; + struct list_head lists[NILFS_BTREE_LEVEL_MAX]; + struct pagevec pvec; + struct buffer_head *bh, *head; + pgoff_t index = 0; + int level, i; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + INIT_LIST_HEAD(&lists[level]); + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) + nilfs_btree_add_dirty_buffer(btree, + lists, bh); + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + cond_resched(); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + list_splice(&lists[level], listp->prev); +} + +static int nilfs_btree_assign_p(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + int ret; + + parent = 
nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + if (buffer_nilfs_node(*bh)) { + path[level].bp_ctxt.oldkey = ptr; + path[level].bp_ctxt.newkey = blocknr; + path[level].bp_ctxt.bh = *bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) + return ret; + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + *bh = path[level].bp_ctxt.bh; + } + + nilfs_btree_node_set_ptr(btree, parent, + path[level + 1].bp_index, blocknr); + + key = nilfs_btree_node_get_key(btree, parent, + path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_level = level; + + return 0; +} + +static int nilfs_btree_assign_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + union nilfs_bmap_ptr_req req; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + req.bpr_ptr = ptr; + ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap, + &req); + if (ret < 0) + return ret; + btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap, + &req, blocknr); + + key = nilfs_btree_node_get_key(btree, parent, + path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_btree_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + } else { + key = nilfs_bmap_data_get_key(bmap, *bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + ret = btree->bt_ops->btop_assign(btree, path, level, bh, + blocknr, binfo); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree *btree; + struct nilfs_btree_node *node; + __u64 key; + int ret; + + btree = (struct nilfs_btree *)bmap; + ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + } else + key = nilfs_bmap_data_get_key(bmap, *bh); + + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + struct buffer_head *bh; + struct nilfs_btree 
*btree; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + if (!buffer_dirty(bh)) + nilfs_btnode_mark_dirty(bh); + nilfs_bmap_put_block(&btree->bt_bmap, bh); + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static const struct nilfs_bmap_operations nilfs_btree_ops = { + .bop_lookup = nilfs_btree_lookup, + .bop_insert = nilfs_btree_insert, + .bop_delete = nilfs_btree_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign, + .bop_mark = nilfs_btree_mark, + + .bop_last_key = nilfs_btree_last_key, + .bop_check_insert = NULL, + .bop_check_delete = nilfs_btree_check_delete, + .bop_gather_data = nilfs_btree_gather_data, +}; + +static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { + .bop_lookup = NULL, + .bop_insert = NULL, + .bop_delete = NULL, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate_gc, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign_gc, + .bop_mark = NULL, + + .bop_last_key = NULL, + .bop_check_insert = NULL, + .bop_check_delete = NULL, + .bop_gather_data = NULL, +}; + +static const struct nilfs_btree_operations nilfs_btree_ops_v = { + .btop_find_target = nilfs_btree_find_target_v, + .btop_set_target = nilfs_btree_set_target_v, + .btop_propagate = nilfs_btree_propagate_v, + .btop_assign = nilfs_btree_assign_v, +}; + +static const struct nilfs_btree_operations nilfs_btree_ops_p = { + .btop_find_target = NULL, + .btop_set_target = NULL, + .btop_propagate = nilfs_btree_propagate_p, + .btop_assign = nilfs_btree_assign_p, +}; + +int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +{ + struct nilfs_btree *btree; + + btree = (struct nilfs_btree *)bmap; + bmap->b_ops = &nilfs_btree_ops; + bmap->b_low = low; + bmap->b_high = high; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + btree->bt_ops = &nilfs_btree_ops_p; + break; + default: + btree->bt_ops = &nilfs_btree_ops_v; + break; + } + + return 0; +} + +void nilfs_btree_init_gc(struct nilfs_bmap *bmap) +{ + bmap->b_low = NILFS_BMAP_LARGE_LOW; + bmap->b_high = NILFS_BMAP_LARGE_HIGH; + bmap->b_ops = &nilfs_btree_ops_gc; +} diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h new file mode 100644 index 000000000000..4766deb52fb1 --- /dev/null +++ b/fs/nilfs2/btree.h @@ -0,0 +1,117 @@ +/* + * btree.h - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BTREE_H +#define _NILFS_BTREE_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/list.h> +#include <linux/nilfs2_fs.h> +#include "btnode.h" +#include "bmap.h" + +struct nilfs_btree; +struct nilfs_btree_path; + +/** + * struct nilfs_btree_operations - B-tree operation table + */ +struct nilfs_btree_operations { + __u64 (*btop_find_target)(const struct nilfs_btree *, + const struct nilfs_btree_path *, __u64); + void (*btop_set_target)(struct nilfs_btree *, __u64, __u64); + + struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *); + + int (*btop_propagate)(struct nilfs_btree *, + struct nilfs_btree_path *, + int, + struct buffer_head *); + int (*btop_assign)(struct nilfs_btree *, + struct nilfs_btree_path *, + int, + struct buffer_head **, + sector_t, + union nilfs_binfo *); +}; + +/** + * struct nilfs_btree_node - B-tree node + * @bn_flags: flags + * @bn_level: level + * @bn_nchildren: number of children + * @bn_pad: padding + */ +struct nilfs_btree_node { + __u8 bn_flags; + __u8 bn_level; + __le16 bn_nchildren; + __le32 bn_pad; +}; + +/* flags */ +#define NILFS_BTREE_NODE_ROOT 0x01 + +/* level */ +#define NILFS_BTREE_LEVEL_DATA 0 +#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) +#define NILFS_BTREE_LEVEL_MAX 14 + +/** + * struct nilfs_btree - B-tree structure + * @bt_bmap: bmap base structure + * @bt_ops: B-tree operation table + */ +struct nilfs_btree { + struct nilfs_bmap bt_bmap; + + /* B-tree-specific members */ + const struct nilfs_btree_operations *bt_ops; +}; + + +#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE +#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ + ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0 +#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64)) +#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \ + (((nodesize) - sizeof(struct nilfs_btree_node) - \ + NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \ + ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1) +#define NILFS_BTREE_KEY_MIN ((__u64)0) +#define NILFS_BTREE_KEY_MAX (~(__u64)0) + + +int nilfs_btree_path_cache_init(void); +void nilfs_btree_path_cache_destroy(void); +int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, + const __u64 *, const __u64 *, + int, __u64, __u64); +void nilfs_btree_init_gc(struct nilfs_bmap *); + +#endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c new file mode 100644 index 000000000000..e90b60dfced9 --- /dev/null +++ b/fs/nilfs2/cpfile.c @@ -0,0 +1,925 @@ +/* + * cpfile.c - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/errno.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "cpfile.h" + + +static inline unsigned long +nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) +{ + return NILFS_MDT(cpfile)->mi_entries_per_block; +} + +/* block number from the beginning of the file */ +static unsigned long +nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); + return (unsigned long)tcno; +} + +/* offset in block */ +static unsigned long +nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); +} + +static unsigned long +nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, + __u64 curr, + __u64 max) +{ + return min_t(__u64, + nilfs_cpfile_checkpoints_per_block(cpfile) - + nilfs_cpfile_get_offset(cpfile, curr), + max - curr); +} + +static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, + __u64 cno) +{ + return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; +} + +static unsigned int +nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + count = le32_to_cpu(cp->cp_checkpoints_count) + n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static unsigned int +nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); + count = le32_to_cpu(cp->cp_checkpoints_count) - n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static inline struct nilfs_cpfile_header * +nilfs_cpfile_block_get_header(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_checkpoint * +nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} + +static void nilfs_cpfile_block_init(struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + int n = nilfs_cpfile_checkpoints_per_block(cpfile); + + while (n-- > 0) { + nilfs_checkpoint_set_invalid(cp); + cp = (void *)cp + cpsz; + } +} + +static inline int nilfs_cpfile_get_header_block(struct inode *cpfile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); +} + 
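The two translation helpers above locate a checkpoint on disk with plain division: the checkpoint number is biased by mi_first_entry_offset - 1, the quotient selects the block and the remainder the slot inside it. A small user-space model of the same mapping, with assumed geometry values:

    #include <stdio.h>
    #include <stdint.h>

    #define CPS_PER_BLOCK           184     /* assumed mi_entries_per_block */
    #define FIRST_ENTRY_OFFSET      2       /* assumed mi_first_entry_offset */

    int main(void)
    {
            uint64_t cno = 1000;            /* checkpoint numbers start at 1 */
            uint64_t t = cno + FIRST_ENTRY_OFFSET - 1;

            /* do_div() in the kernel divides in place and returns the
             * remainder: the quotient is the block offset, the
             * remainder the in-block slot */
            printf("cno %llu -> block %llu, slot %llu\n",
                   (unsigned long long)cno,
                   (unsigned long long)(t / CPS_PER_BLOCK),
                   (unsigned long long)(t % CPS_PER_BLOCK));
            return 0;
    }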
+static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, + __u64 cno, + int create, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno), + create, nilfs_cpfile_block_init, bhp); +} + +static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, + __u64 cno) +{ + return nilfs_mdt_delete_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno)); +} + +/** + * nilfs_cpfile_get_checkpoint - get a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @create: create flag + * @cpp: pointer to a checkpoint + * @bhp: pointer to a buffer head + * + * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint + * specified by @cno. A new checkpoint will be created if @cno is the current + * checkpoint number and @create is nonzero. + * + * Return Value: On success, 0 is returned, and the checkpoint and the + * buffer head of the buffer on which the checkpoint is located are stored in + * the place pointed by @cpp and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + * + * %-EINVAL - invalid checkpoint. + */ +int nilfs_cpfile_get_checkpoint(struct inode *cpfile, + __u64 cno, + int create, + struct nilfs_checkpoint **cpp, + struct buffer_head **bhp) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || + (cno < nilfs_mdt_cno(cpfile) && create))) + return -EINVAL; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); + if (ret < 0) + goto out_header; + kaddr = kmap(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + if (!create) { + kunmap(cp_bh->b_page); + brelse(cp_bh); + ret = -ENOENT; + goto out_header; + } + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + nilfs_mdt_mark_buffer_dirty(cp_bh); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + } + + if (cpp != NULL) + *cpp = cp; + *bhp = cp_bh; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_put_checkpoint - put a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @bh: buffer head + * + * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint + * specified by @cno. @bh must be the buffer head which has been returned by + * a previous call to nilfs_cpfile_get_checkpoint() with @cno. 
+ */ +void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_cpfile_delete_checkpoints - delete checkpoints + * @cpfile: inode of checkpoint file + * @start: start checkpoint number + * @end: end checkpoint number + * + * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in + * the period from @start to @end, excluding @end itself. The checkpoints + * which have already been deleted are ignored. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ +int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, + __u64 start, + __u64 end) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cno; + void *kaddr; + unsigned long tnicps; + int ret, ncps, nicps, count, i; + + if (unlikely(start == 0 || start > end)) { + printk(KERN_ERR "%s: invalid range of checkpoint numbers: " + "[%llu, %llu)\n", __func__, + (unsigned long long)start, (unsigned long long)end); + return -EINVAL; + } + + /* cannot delete the latest checkpoint */ + if (start == nilfs_mdt_cno(cpfile) - 1) + return -EPERM; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + tnicps = 0; + + for (cno = start; cno < end; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_sem; + /* skip hole */ + ret = 0; + continue; + } + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, cno, cp_bh, kaddr); + nicps = 0; + for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { + WARN_ON(nilfs_checkpoint_snapshot(cp)); + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_checkpoint_set_invalid(cp); + nicps++; + } + } + if (nicps > 0) { + tnicps += nicps; + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (!nilfs_cpfile_is_in_first(cpfile, cno) && + (count = nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps)) == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR "%s: cannot delete block\n", + __func__); + goto out_sem; + } + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + } + + if (tnicps > 0) { + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + kunmap_atomic(kaddr, KM_USER0); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, + struct nilfs_checkpoint *cp, + struct nilfs_cpinfo *ci) +{ + ci->ci_flags = le32_to_cpu(cp->cp_flags); + ci->ci_cno = le64_to_cpu(cp->cp_cno); + ci->ci_create = le64_to_cpu(cp->cp_create); + ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); + ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); + ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); 
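The deletion loop above advances one block at a time: nilfs_cpfile_checkpoints_in_block() caps each pass at the checkpoints sharing a single block, so a block whose count of valid checkpoints drops to zero can be punched out as a hole before the loop moves on. A user-space sketch of that stride (geometry assumed, first-entry offset omitted for brevity):

    #include <stdio.h>
    #include <stdint.h>

    #define CPS_PER_BLOCK   184     /* assumed entries per block */

    static uint64_t in_block(uint64_t curr, uint64_t max)
    {
            uint64_t room = CPS_PER_BLOCK - curr % CPS_PER_BLOCK;

            return room < max - curr ? room : max - curr;
    }

    int main(void)
    {
            uint64_t start = 170, end = 560, cno, n;

            for (cno = start; cno < end; cno += n) {
                    n = in_block(cno, end);
                    printf("block %llu: checkpoints [%llu, %llu)\n",
                           (unsigned long long)(cno / CPS_PER_BLOCK),
                           (unsigned long long)cno,
                           (unsigned long long)(cno + n));
            }
            return 0;
    }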
+ ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); +} + +static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, + struct nilfs_cpinfo *ci, size_t nci) +{ + struct nilfs_checkpoint *cp; + struct buffer_head *bh; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + void *kaddr; + int n, ret; + int ncps, i; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + for (n = 0; cno < cur_cno && n < nci; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + continue; /* skip hole */ + } + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { + if (!nilfs_checkpoint_invalid(cp)) + nilfs_cpfile_checkpoint_to_cpinfo( + cpfile, cp, &ci[n++]); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + } + + ret = n; + if (n > 0) + *cnop = ci[n - 1].ci_cno + 1; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, + struct nilfs_cpinfo *ci, size_t nci) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + __u64 curr = *cnop, next; + unsigned long curr_blkoff, next_blkoff; + void *kaddr; + int n = 0, ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + if (curr == 0) { + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + if (curr == 0) { + ret = 0; + goto out; + } + } else if (unlikely(curr == ~(__u64)0)) { + ret = 0; + goto out; + } + + curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = 0; /* No snapshots (started from a hole block) */ + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + while (n < nci) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); + curr = ~(__u64)0; /* Terminator */ + if (unlikely(nilfs_checkpoint_invalid(cp) || + !nilfs_checkpoint_snapshot(cp))) + break; + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]); + next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); + if (next == 0) + break; /* reach end of the snapshot list */ + + next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); + if (curr_blkoff != next_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, + 0, &bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -ENOENT); + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + } + curr = next; + curr_blkoff = next_blkoff; + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + *cnop = curr; + ret = n; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_get_cpinfo - get information on checkpoints + * @cpfile: inode of checkpoint file + * @cnop: place to pass in the starting checkpoint number and to receive + * the number to continue the listing from + * @mode: mode of checkpoints to list (NILFS_CHECKPOINT or NILFS_SNAPSHOT) + * @ci: buffer to store checkpoint information + * @nci: size of @ci in number of entries + */ +ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, + struct nilfs_cpinfo *ci, size_t nci) +{ + switch (mode) { + case NILFS_CHECKPOINT: + return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci); + case NILFS_SNAPSHOT: + 
return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_delete_checkpoint - + * @cpfile: + * @cno: + */ +int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct nilfs_cpinfo ci; + __u64 tcno = cno; + ssize_t nci; + int ret; + + nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1); + if (nci < 0) + return nci; + else if (nci == 0 || ci.ci_cno != cno) + return -ENOENT; + + /* cannot delete the latest checkpoint nor snapshots */ + ret = nilfs_cpinfo_snapshot(&ci); + if (ret < 0) + return ret; + else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1) + return -EPERM; + + return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); +} + +static struct nilfs_snapshot_list * +nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + + if (cno != 0) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + list = &cp->cp_snapshot_list; + } else { + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + list = &header->ch_snapshot_list; + } + return list; +} + +static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 curr, prev; + unsigned long curr_blkoff, prev_blkoff; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + list = &header->ch_snapshot_list; + curr_bh = header_bh; + get_bh(curr_bh); + curr = 0; + curr_blkoff = 0; + prev = le64_to_cpu(list->ssl_prev); + while (prev > cno) { + prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); + curr = prev; + if (curr_blkoff != prev_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(curr_bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, + 0, &curr_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + } + curr_blkoff = prev_blkoff; + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, curr, curr_bh, kaddr); + list = &cp->cp_snapshot_list; + prev = le64_to_cpu(list->ssl_prev); + } + kunmap_atomic(kaddr, KM_USER0); + + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_curr; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, curr, curr_bh, kaddr); + list->ssl_prev = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, 
cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); + nilfs_checkpoint_set_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, 1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(prev_bh); + nilfs_mdt_mark_buffer_dirty(curr_bh); + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_curr: + brelse(curr_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 next, prev; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (!nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + + list = &cp->cp_snapshot_list; + next = le64_to_cpu(list->ssl_next); + prev = le64_to_cpu(list->ssl_prev); + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + if (next != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, + &next_bh); + if (ret < 0) + goto out_header; + } else { + next_bh = header_bh; + get_bh(next_bh); + } + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_next; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(next_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, next, next_bh, kaddr); + list->ssl_prev = cpu_to_le64(prev); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(next); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); + nilfs_checkpoint_clear_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, -1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(next_bh); + nilfs_mdt_mark_buffer_dirty(prev_bh); + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_next: + 
brelse(next_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_is_snapshot - test whether a checkpoint is a snapshot + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * + * Description: nilfs_cpfile_is_snapshot() tests whether the checkpoint + * specified by @cno is a snapshot. + * + * Return Value: On success, 1 is returned if the checkpoint specified by + * @cno is a snapshot, or 0 if not. On error, one of the following negative + * error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + ret = nilfs_checkpoint_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_change_cpmode - change checkpoint mode + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @mode: mode of checkpoint + * + * Description: nilfs_cpfile_change_cpmode() changes the mode of the + * checkpoint specified by @cno. The mode @mode is NILFS_CHECKPOINT or + * NILFS_SNAPSHOT. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) +{ + struct the_nilfs *nilfs; + int ret; + + nilfs = NILFS_MDT(cpfile)->mi_nilfs; + + switch (mode) { + case NILFS_CHECKPOINT: + /* + * Check for protecting existing snapshot mounts: + * bd_mount_sem is used to make this operation atomic and + * exclusive with a new mount job. Though it doesn't cover + * umount, it's enough for the purpose. + */ + down(&nilfs->ns_bdev->bd_mount_sem); + if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { + /* Current implementation does not have to protect + plain read-only mounts since they are exclusive + with a read/write mount and are protected from the + cleaner. */ + ret = -EBUSY; + } else + ret = nilfs_cpfile_clear_snapshot(cpfile, cno); + up(&nilfs->ns_bdev->bd_mount_sem); + return ret; + case NILFS_SNAPSHOT: + return nilfs_cpfile_set_snapshot(cpfile, cno); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_get_stat - get checkpoint statistics + * @cpfile: inode of checkpoint file + * @cpstat: pointer to a structure of checkpoint statistics + * + * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * + * Return Value: On success, 0 is returned, and checkpoint information is + * stored in the place pointed to by @cpstat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available.
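+ * + * A caller sketch (illustrative only, not part of this interface): + * + *	struct nilfs_cpstat cpstat; + *	ret = nilfs_cpfile_get_stat(cpfile, &cpstat); + *	if (!ret) + *		printk("%llu checkpoints, %llu snapshots\n", + *		       (unsigned long long)cpstat.cs_ncps, + *		       (unsigned long long)cpstat.cs_nsss);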
+ */ +int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + cpstat->cs_cno = nilfs_mdt_cno(cpfile); + cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); + cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h new file mode 100644 index 000000000000..1a8a1008c342 --- /dev/null +++ b/fs/nilfs2/cpfile.h @@ -0,0 +1,45 @@ +/* + * cpfile.h - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_CPFILE_H +#define _NILFS_CPFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> + +#define NILFS_CPFILE_GFP NILFS_MDT_GFP + + +int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, + struct nilfs_checkpoint **, + struct buffer_head **); +void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); +int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); +int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); +int nilfs_cpfile_is_snapshot(struct inode *, __u64); +int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, + struct nilfs_cpinfo *, size_t); + +#endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c new file mode 100644 index 000000000000..bb8a5818e7f1 --- /dev/null +++ b/fs/nilfs2/dat.c @@ -0,0 +1,430 @@ +/* + * dat.c - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/string.h> +#include <linux/errno.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "dat.h" + + +#define NILFS_CNO_MIN ((__u64)1) +#define NILFS_CNO_MAX (~(__u64)0) + +static int nilfs_dat_prepare_entry(struct inode *dat, + struct nilfs_palloc_req *req, int create) +{ + return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); +} + +static void nilfs_dat_commit_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh); + nilfs_mdt_mark_dirty(dat); + brelse(req->pr_entry_bh); +} + +static void nilfs_dat_abort_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_entry_bh); +} + +int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_alloc_entry(dat, req); + if (ret < 0) + return ret; + + ret = nilfs_dat_prepare_entry(dat, req, 1); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(dat, req); + + return ret; +} + +void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MAX); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_palloc_commit_alloc_entry(dat, req); + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_alloc_entry(dat, req); +} + +int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) + return ret; + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + nilfs_palloc_abort_free_entry(dat, req); + return ret; + } + return 0; +} + +void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MIN); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); + nilfs_palloc_commit_free_entry(dat, req); +} + +void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_free_entry(dat, req); +} + +int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + WARN_ON(ret == -ENOENT); + return ret; +} + +void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, + sector_t blocknr) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + 
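+ /* the entry's lifetime begins at the current checkpoint number */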
entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); + if (entry->de_blocknr != cpu_to_le64(0) || + entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) { + printk(KERN_CRIT + "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n", + __func__, (unsigned long long)req->pr_entry_nr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end), + (unsigned long long)le64_to_cpu(entry->de_blocknr)); + } + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); +} + +int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) { + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) { + nilfs_dat_abort_entry(dat, req); + return ret; + } + } + + return 0; +} + +void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, + int dead) +{ + struct nilfs_dat_entry *entry; + __u64 start, end; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + end = start = le64_to_cpu(entry->de_start); + if (!dead) { + end = nilfs_mdt_cno(dat); + WARN_ON(start > end); + } + entry->de_end = cpu_to_le64(end); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) + nilfs_dat_commit_free(dat, req); + else + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (start == nilfs_mdt_cno(dat) && blocknr == 0) + nilfs_palloc_abort_free_entry(dat, req); + nilfs_dat_abort_entry(dat, req); +} + +/** + * nilfs_dat_mark_dirty - mark the DAT entry of a virtual block number dirty + * @dat: DAT file inode + * @vblocknr: virtual block number + * + * Description: nilfs_dat_mark_dirty() marks the DAT entry block containing + * the entry of @vblocknr dirty so that it is written out by the next + * segment construction. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = vblocknr; + ret = nilfs_dat_prepare_entry(dat, &req, 0); + if (ret == 0) + nilfs_dat_commit_entry(dat, &req); + return ret; +} + +/** + * nilfs_dat_freev - free virtual block numbers + * @dat: DAT file inode + * @vblocknrs: array of virtual block numbers + * @nitems: number of virtual block numbers + * + * Description: nilfs_dat_freev() frees the virtual block numbers specified by + * @vblocknrs and @nitems. + * + * Return Value: On success, 0 is returned.
On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block numbers have not been allocated. + */ +int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) +{ + return nilfs_palloc_freev(dat, vblocknrs, nitems); +} + +/** + * nilfs_dat_move - change a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknr: block number + * + * Description: nilfs_dat_move() changes the block number associated with + * @vblocknr to @blocknr. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { + printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, + (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return -EINVAL; + } + WARN_ON(blocknr == 0); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(entry_bh); + nilfs_mdt_mark_dirty(dat); + + brelse(entry_bh); + + return 0; +} + +/** + * nilfs_dat_translate - translate a virtual block number to a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknrp: pointer to a block number + * + * Description: nilfs_dat_translate() maps the virtual block number @vblocknr + * to the corresponding block number. + * + * Return Value: On success, 0 is returned and the block number associated + * with @vblocknr is stored in the place pointed to by @blocknrp. On error, + * one of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A block number associated with @vblocknr does not exist.
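+ * + * A minimal caller sketch (names are illustrative; the block device and + * block size come from the caller's context): + * + *	sector_t pbn; + *	err = nilfs_dat_translate(dat, vblocknr, &pbn); + *	if (!err) + *		bh = __bread(bdev, pbn, blocksize);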
+ */ +int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + blocknr = le64_to_cpu(entry->de_blocknr); + if (blocknr == 0) { + ret = -ENOENT; + goto out; + } + if (blocknrp != NULL) + *blocknrp = blocknr; + + out: + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return ret; +} + +ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo, + size_t nvi) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + __u64 first, last; + void *kaddr; + unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + int i, j, n, ret; + + for (i = 0; i < nvi; i += n) { + ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr, + 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + /* last virtual block number in this block */ + first = vinfo[i].vi_vblocknr; + do_div(first, entries_per_block); + first *= entries_per_block; + last = first + entries_per_block - 1; + for (j = i, n = 0; + j < nvi && vinfo[j].vi_vblocknr >= first && + vinfo[j].vi_vblocknr <= last; + j++, n++) { + entry = nilfs_palloc_block_get_entry( + dat, vinfo[j].vi_vblocknr, entry_bh, kaddr); + vinfo[j].vi_start = le64_to_cpu(entry->de_start); + vinfo[j].vi_end = le64_to_cpu(entry->de_end); + vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + } + + return nvi; +} diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h new file mode 100644 index 000000000000..d9560654a4b7 --- /dev/null +++ b/fs/nilfs2/dat.h @@ -0,0 +1,52 @@ +/* + * dat.h - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. 
+ */ + +#ifndef _NILFS_DAT_H +#define _NILFS_DAT_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> + +#define NILFS_DAT_GFP NILFS_MDT_GFP + +struct nilfs_palloc_req; + +int nilfs_dat_translate(struct inode *, __u64, sector_t *); + +int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, + sector_t); +void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); +void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); + +int nilfs_dat_mark_dirty(struct inode *, __u64); +int nilfs_dat_freev(struct inode *, __u64 *, size_t); +int nilfs_dat_move(struct inode *, __u64, sector_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t); + +#endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c new file mode 100644 index 000000000000..54100acc1102 --- /dev/null +++ b/fs/nilfs2/dir.c @@ -0,0 +1,711 @@ +/* + * dir.c - NILFS directory entry operations + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net> + */ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include <linux/pagemap.h> +#include <linux/smp_lock.h> +#include "nilfs.h" +#include "page.h" + +/* + * nilfs uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned nilfs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void nilfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. 
+ */ +static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int nilfs_prepare_chunk_uninterruptible(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return block_write_begin(NULL, mapping, pos, to - from, + AOP_FLAG_UNINTERRUPTIBLE, &page, + NULL, nilfs_get_block); +} + +static int nilfs_prepare_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return block_write_begin(NULL, mapping, pos, to - from, 0, &page, + NULL, nilfs_get_block); +} + +static int nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + struct inode *dir = mapping->host; + struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb); + loff_t pos = page_offset(page) + from; + unsigned len = to - from; + unsigned nr_dirty, copied; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos + copied > dir->i_size) { + i_size_write(dir, pos + copied); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + err = nilfs_set_file_dirty(sbi, dir, nr_dirty); + unlock_page(page); + return err; +} + +static void nilfs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = nilfs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct nilfs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct nilfs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < NILFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < NILFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + nilfs_error(sb, "nilfs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; +bad_entry: + nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le64_to_cpu(p->inode), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct nilfs_dir_entry *)(kaddr + offs); + nilfs_error(sb, "nilfs_check_page", + "entry in directory #%lu spans the page boundary, " + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le64_to_cpu(p->inode)); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page
*nilfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_cache_page(mapping, n, + (filler_t *)mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + if (!PageChecked(page)) + nilfs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + nilfs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. + * + * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. + */ +static int +nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) +{ + return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static unsigned char +nilfs_filetype_table[NILFS_FT_MAX] = { + [NILFS_FT_UNKNOWN] = DT_UNKNOWN, + [NILFS_FT_REG_FILE] = DT_REG, + [NILFS_FT_DIR] = DT_DIR, + [NILFS_FT_CHRDEV] = DT_CHR, + [NILFS_FT_BLKDEV] = DT_BLK, + [NILFS_FT_FIFO] = DT_FIFO, + [NILFS_FT_SOCK] = DT_SOCK, + [NILFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char +nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK, +}; + +static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + + de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); +/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ + unsigned char *types = NULL; + int ret; + + if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) + goto success; + + types = nilfs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct nilfs_dir_entry *de; + struct page *page = nilfs_get_page(inode, n); + + if (IS_ERR(page)) { + nilfs_error(sb, __func__, "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + ret = -EIO; + goto done; + } + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)(kaddr + offset); + limit = kaddr + nilfs_last_byte(inode, n) - + NILFS_DIR_REC_LEN(1); + for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { + if (de->rec_len == 0) { + nilfs_error(sb, __func__, + "zero-length directory entry"); + ret = -EIO; + nilfs_put_page(page); + goto done; + } + if (de->inode) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < NILFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<<PAGE_CACHE_SHIFT) | offset, + le64_to_cpu(de->inode), d_type); + if (over) { + nilfs_put_page(page); + goto success; + } + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + 
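+ /* finished with this page; release it before advancing to the next one */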
nilfs_put_page(page); + } + +success: + ret = 0; +done: + return ret; +} + +/* + * nilfs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_page). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct nilfs_dir_entry * +nilfs_find_entry(struct inode *dir, struct dentry *dentry, + struct page **res_page) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct nilfs_inode_info *ei = NILFS_I(dir); + struct nilfs_dir_entry *de; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = nilfs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + nilfs_put_page(page); + goto out; + } + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + nilfs_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block count %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = nilfs_get_page(dir, 0); + struct nilfs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = nilfs_next_entry( + (struct nilfs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct nilfs_dir_entry *de; + struct page *page; + + de = nilfs_find_entry(dir, dentry, &page); + if (de) { + res = le64_to_cpu(de->inode); + kunmap(page); + page_cache_release(page); + } + return res; +} + +/* Releases the page */ +void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct page *page, struct inode *inode) +{ + unsigned from = (char *) de - (char *) page_address(page); + unsigned to = from + le16_to_cpu(de->rec_len); + struct address_space *mapping = page->mapping; + int err; + + lock_page(page); + err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to); + BUG_ON(err); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + err = nilfs_commit_chunk(page, mapping, from, to); + nilfs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(dir); +} + +/* + * Parent is locked.
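+ * Adds @dentry's name to @dir; if no existing chunk has a large enough + * free slot, the directory is extended by one chunk. Returns -EEXIST when + * the name is already present.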
+ */ +int nilfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = nilfs_chunk_size(dir); + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct nilfs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + unsigned from, to; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = nilfs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + nilfs_last_byte(dir, n); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (nilfs_match(namelen, name, de)) + goto out_unlock; + name_len = NILFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct nilfs_dir_entry *)((char *)de + rec_len); + } + unlock_page(page); + nilfs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + from = (char *)de - (char *)page_address(page); + to = from + rec_len; + err = nilfs_prepare_chunk(page, page->mapping, from, to); + if (err) + goto out_unlock; + if (de->inode) { + struct nilfs_dir_entry *de1; + + de1 = (struct nilfs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + err = nilfs_commit_chunk(page, page->mapping, from, to); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + nilfs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * nilfs_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. 
+ */ +int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + struct nilfs_dir_entry *pde = NULL; + struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + int err; + + while ((char *)de < (char *)dir) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = nilfs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + lock_page(page); + err = nilfs_prepare_chunk(page, mapping, from, to); + BUG_ON(err); + if (pde) + pde->rec_len = cpu_to_le16(to - from); + dir->inode = 0; + err = nilfs_commit_chunk(page, mapping, from, to); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; +/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(inode); +out: + nilfs_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int nilfs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = nilfs_chunk_size(inode); + struct nilfs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = nilfs_prepare_chunk(page, mapping, 0, chunk_size); + if (unlikely(err)) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, chunk_size); + de = (struct nilfs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); + memcpy(de->name, ".\0\0", 4); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + + de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); + de->inode = cpu_to_le64(parent->i_ino); + memcpy(de->name, "..\0", 4); + nilfs_set_de_type(de, inode); + kunmap_atomic(kaddr, KM_USER0); + err = nilfs_commit_chunk(page, mapping, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int nilfs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct nilfs_dir_entry *de; + + page = nilfs_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry " + "(kaddr=%p, de=%p)\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. 
*/ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le64(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + return 1; + +not_empty: + nilfs_put_page(page); + return 0; +} + +struct file_operations nilfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = nilfs_readdir, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_ioctl, +#endif /* CONFIG_COMPAT */ + .fsync = nilfs_sync_file, + +}; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c new file mode 100644 index 000000000000..c6379e482781 --- /dev/null +++ b/fs/nilfs2/direct.c @@ -0,0 +1,436 @@ +/* + * direct.c - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/errno.h> +#include "nilfs.h" +#include "page.h" +#include "direct.h" +#include "alloc.h" + +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) +{ + return (__le64 *) + ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1); +} + +static inline __u64 +nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key) +{ + return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key)); +} + +static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct, + __u64 key, __u64 ptr) +{ + *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr); +} + +static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_direct *direct; + __u64 ptr; + + direct = (struct nilfs_direct *)bmap; + if ((key > NILFS_DIRECT_KEY_MAX) || + (level != 1) || /* XXX: use macro for level 1 */ + ((ptr = nilfs_direct_get_ptr(direct, key)) == + NILFS_BMAP_INVALID_PTR)) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + return 0; +} + +static __u64 +nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else + /* block group */ + return nilfs_bmap_find_target_in_group(&direct->d_bmap); +} + +static void nilfs_direct_set_target_v(struct nilfs_direct *direct, + __u64 key, __u64 ptr) +{ + direct->d_bmap.b_last_allocated_key = key; + direct->d_bmap.b_last_allocated_ptr = ptr; +} + +static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, + __u64 key, + union nilfs_bmap_ptr_req *req, + struct nilfs_bmap_stats *stats) +{ + int ret; + + if (direct->d_ops->dop_find_target != NULL) + req->bpr_ptr = direct->d_ops->dop_find_target(direct, key); + 
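+ /* reserve a new block pointer; bpr_ptr carries the allocation + hint chosen above */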
ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap, + req); + if (ret < 0) + return ret; + + stats->bs_nblocks = 1; + return 0; +} + +static void nilfs_direct_commit_insert(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key, __u64 ptr) +{ + struct buffer_head *bh; + + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); + + if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL) + direct->d_bmap.b_pops->bpop_commit_alloc_ptr( + &direct->d_bmap, req); + nilfs_direct_set_ptr(direct, key, req->bpr_ptr); + + if (!nilfs_bmap_dirty(&direct->d_bmap)) + nilfs_bmap_set_dirty(&direct->d_bmap); + + if (direct->d_ops->dop_set_target != NULL) + direct->d_ops->dop_set_target(direct, key, req->bpr_ptr); +} + +static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + struct nilfs_direct *direct; + union nilfs_bmap_ptr_req req; + struct nilfs_bmap_stats stats; + int ret; + + direct = (struct nilfs_direct *)bmap; + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) + return -EEXIST; + + ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); + if (ret < 0) + return ret; + nilfs_direct_commit_insert(direct, &req, key, ptr); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + + return 0; +} + +static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key, + struct nilfs_bmap_stats *stats) +{ + int ret; + + if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + req->bpr_ptr = nilfs_direct_get_ptr(direct, key); + ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr( + &direct->d_bmap, req); + if (ret < 0) + return ret; + } + + stats->bs_nblocks = 1; + return 0; +} + +static void nilfs_direct_commit_delete(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key) +{ + if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL) + direct->d_bmap.b_pops->bpop_commit_end_ptr( + &direct->d_bmap, req); + nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); +} + +static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) +{ + struct nilfs_direct *direct; + union nilfs_bmap_ptr_req req; + struct nilfs_bmap_stats stats; + int ret; + + direct = (struct nilfs_direct *)bmap; + if ((key > NILFS_DIRECT_KEY_MAX) || + nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); + if (ret < 0) + return ret; + nilfs_direct_commit_delete(direct, &req, key); + nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + + return 0; +} + +static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +{ + struct nilfs_direct *direct; + __u64 key, lastkey; + + direct = (struct nilfs_direct *)bmap; + lastkey = NILFS_DIRECT_KEY_MAX + 1; + for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) + lastkey = key; + + if (lastkey == NILFS_DIRECT_KEY_MAX + 1) + return -ENOENT; + + *keyp = lastkey; + + return 0; +} + +static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) +{ + return key > NILFS_DIRECT_KEY_MAX; +} + +static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct nilfs_direct *direct; + __u64 key; + __u64 ptr; + int n; + + direct = (struct nilfs_direct *)bmap; + if (nitems > NILFS_DIRECT_NBLOCKS) + 
nitems = NILFS_DIRECT_NBLOCKS; + n = 0; + for (key = 0; key < nitems; key++) { + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) { + keys[n] = key; + ptrs[n] = ptr; + n++; + } + } + return n; +} + +int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, + __u64 key, __u64 *keys, __u64 *ptrs, + int n, __u64 low, __u64 high) +{ + struct nilfs_direct *direct; + __le64 *dptrs; + int ret, i, j; + + /* no need to allocate any resource for conversion */ + + /* delete */ + ret = bmap->b_ops->bop_delete(bmap, key); + if (ret < 0) + return ret; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* convert */ + direct = (struct nilfs_direct *)bmap; + dptrs = nilfs_direct_dptrs(direct); + for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { + if ((j < n) && (i == keys[j])) { + dptrs[i] = (i != key) ? + nilfs_bmap_ptr_to_dptr(ptrs[j]) : + NILFS_BMAP_INVALID_PTR; + j++; + } else + dptrs[i] = NILFS_BMAP_INVALID_PTR; + } + + nilfs_direct_init(bmap, low, high); + + return 0; +} + +static int nilfs_direct_propagate_v(struct nilfs_direct *direct, + struct buffer_head *bh) +{ + union nilfs_bmap_ptr_req oldreq, newreq; + __u64 key; + __u64 ptr; + int ret; + + key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); + ptr = nilfs_direct_get_ptr(direct, key); + if (!buffer_nilfs_volatile(bh)) { + oldreq.bpr_ptr = ptr; + newreq.bpr_ptr = ptr; + ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq, + &newreq); + if (ret < 0) + return ret; + nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq); + set_buffer_nilfs_volatile(bh); + nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); + } else + ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); + + return ret; +} + +static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_direct *direct; + + direct = (struct nilfs_direct *)bmap; + return (direct->d_ops->dop_propagate != NULL) ? 
+ direct->d_ops->dop_propagate(direct, bh) : + 0; +} + +static int nilfs_direct_assign_v(struct nilfs_direct *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + union nilfs_bmap_ptr_req req; + int ret; + + req.bpr_ptr = ptr; + ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr( + &direct->d_bmap, &req); + if (ret < 0) + return ret; + direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap, + &req, blocknr); + + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_direct_assign_p(struct nilfs_direct *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + nilfs_direct_set_ptr(direct, key, blocknr); + + binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_level = 0; + + return 0; +} + +static int nilfs_direct_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_direct *direct; + __u64 key; + __u64 ptr; + + direct = (struct nilfs_direct *)bmap; + key = nilfs_bmap_data_get_key(bmap, *bh); + if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { + printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, + (unsigned long long)key); + return -EINVAL; + } + ptr = nilfs_direct_get_ptr(direct, key); + if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { + printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, + (unsigned long long)ptr); + return -EINVAL; + } + + return direct->d_ops->dop_assign(direct, key, ptr, bh, + blocknr, binfo); +} + +static const struct nilfs_bmap_operations nilfs_direct_ops = { + .bop_lookup = nilfs_direct_lookup, + .bop_insert = nilfs_direct_insert, + .bop_delete = nilfs_direct_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_direct_propagate, + + .bop_lookup_dirty_buffers = NULL, + + .bop_assign = nilfs_direct_assign, + .bop_mark = NULL, + + .bop_last_key = nilfs_direct_last_key, + .bop_check_insert = nilfs_direct_check_insert, + .bop_check_delete = NULL, + .bop_gather_data = nilfs_direct_gather_data, +}; + + +static const struct nilfs_direct_operations nilfs_direct_ops_v = { + .dop_find_target = nilfs_direct_find_target_v, + .dop_set_target = nilfs_direct_set_target_v, + .dop_propagate = nilfs_direct_propagate_v, + .dop_assign = nilfs_direct_assign_v, +}; + +static const struct nilfs_direct_operations nilfs_direct_ops_p = { + .dop_find_target = NULL, + .dop_set_target = NULL, + .dop_propagate = NULL, + .dop_assign = nilfs_direct_assign_p, +}; + +int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +{ + struct nilfs_direct *direct; + + direct = (struct nilfs_direct *)bmap; + bmap->b_ops = &nilfs_direct_ops; + bmap->b_low = low; + bmap->b_high = high; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + direct->d_ops = &nilfs_direct_ops_p; + break; + default: + direct->d_ops = &nilfs_direct_ops_v; + break; + } + + return 0; +} diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h new file mode 100644 index 000000000000..45d2c5cda812 --- /dev/null +++ b/fs/nilfs2/direct.h @@ -0,0 +1,78 @@ +/* + * direct.h - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_DIRECT_H +#define _NILFS_DIRECT_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include "bmap.h" + + +struct nilfs_direct; + +/** + * struct nilfs_direct_operations - direct mapping operation table + */ +struct nilfs_direct_operations { + __u64 (*dop_find_target)(const struct nilfs_direct *, __u64); + void (*dop_set_target)(struct nilfs_direct *, __u64, __u64); + int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *); + int (*dop_assign)(struct nilfs_direct *, __u64, __u64, + struct buffer_head **, sector_t, + union nilfs_binfo *); +}; + +/** + * struct nilfs_direct_node - direct node + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +/** + * struct nilfs_direct - direct mapping + * @d_bmap: bmap structure + * @d_ops: direct mapping operation table + */ +struct nilfs_direct { + struct nilfs_bmap d_bmap; + + /* direct-mapping-specific members */ + const struct nilfs_direct_operations *d_ops; +}; + + +#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) +#define NILFS_DIRECT_KEY_MIN 0 +#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) + + +int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *, + __u64 *, int, __u64, __u64); + + +#endif /* _NILFS_DIRECT_H */ diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c new file mode 100644 index 000000000000..6bd84a0d8238 --- /dev/null +++ b/fs/nilfs2/file.c @@ -0,0 +1,160 @@ +/* + * file.c - NILFS regular file handling primitives including fsync(). + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net>, + * Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/writeback.h> +#include "nilfs.h" +#include "segment.h" + +int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + /* + * Called from fsync() system call + * This is the only entry point that can catch write and synch + * timing for both data blocks and intermediate blocks. + * + * This function should be implemented when the writeback function + * will be implemented. + */ + struct inode *inode = dentry->d_inode; + int err; + + if (!nilfs_inode_dirty(inode)) + return 0; + + if (datasync) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0, + LLONG_MAX); + else + err = nilfs_construct_segment(inode->i_sb); + + return err; +} + +static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct nilfs_transaction_info ti; + int ret; + + if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs))) + return VM_FAULT_SIGBUS; /* -ENOSPC */ + + lock_page(page); + if (page->mapping != inode->i_mapping || + page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { + unlock_page(page); + return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) { + unlock_page(page); + goto mapped; + } + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int fully_mapped = 1; + + bh = head = page_buffers(page); + do { + if (!buffer_mapped(bh)) { + fully_mapped = 0; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (fully_mapped) { + SetPageMappedToDisk(page); + unlock_page(page); + goto mapped; + } + } + unlock_page(page); + + /* + * fill hole blocks + */ + ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); + /* never returns -ENOMEM, but may return -ENOSPC */ + if (unlikely(ret)) + return VM_FAULT_SIGBUS; + + ret = block_page_mkwrite(vma, vmf, nilfs_get_block); + if (unlikely(ret)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); + + mapped: + SetPageChecked(page); + wait_on_page_writeback(page); + return 0; +} + +struct vm_operations_struct nilfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nilfs_page_mkwrite, +}; + +static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &nilfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the nilfs filesystem. 
+ */ +struct file_operations nilfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_ioctl, +#endif /* CONFIG_COMPAT */ + .mmap = nilfs_file_mmap, + .open = generic_file_open, + /* .release = nilfs_release_file, */ + .fsync = nilfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +struct inode_operations nilfs_file_inode_operations = { + .truncate = nilfs_truncate, + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +/* end of file */ diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c new file mode 100644 index 000000000000..93383c5cee90 --- /dev/null +++ b/fs/nilfs2/gcdat.c @@ -0,0 +1,84 @@ +/* + * gcdat.c - NILFS shadow DAT inode for GC + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, + * and Ryusuke Konishi <ryusuke@osrg.net>. + * + */ + +#include <linux/buffer_head.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + +int nilfs_init_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat); + int err; + + gcdat->i_state = 0; + gcdat->i_blocks = dat->i_blocks; + gii->i_flags = dii->i_flags; + gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT); + gii->i_cno = 0; + nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap); + err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping); + if (unlikely(err)) + return err; + + return nilfs_copy_dirty_pages(&gii->i_btnode_cache, + &dii->i_btnode_cache); +} + +void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat); + struct address_space *mapping = dat->i_mapping; + struct address_space *gmapping = gcdat->i_mapping; + + down_write(&NILFS_MDT(dat)->mi_sem); + dat->i_blocks = gcdat->i_blocks; + dii->i_flags = gii->i_flags; + dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT); + + nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap); + + nilfs_clear_dirty_pages(mapping); + nilfs_copy_back_pages(mapping, gmapping); + /* note: mdt dirty flags should be cleared by segctor. 
*/
+
+	nilfs_clear_dirty_pages(&dii->i_btnode_cache);
+	nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
+
+	up_write(&NILFS_MDT(dat)->mi_sem);
+}
+
+void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
+{
+	struct inode *gcdat = nilfs->ns_gc_dat;
+	struct nilfs_inode_info *gii = NILFS_I(gcdat);
+
+	gcdat->i_state = I_CLEAR;
+	gii->i_flags = 0;
+
+	truncate_inode_pages(gcdat->i_mapping, 0);
+	truncate_inode_pages(&gii->i_btnode_cache, 0);
+}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
new file mode 100644
index 000000000000..19d2102b6a69
--- /dev/null
+++ b/fs/nilfs2/gcinode.c
@@ -0,0 +1,288 @@
+/*
+ * gcinode.c - dummy inodes to buffer blocks for garbage collection
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
+ *            and Ryusuke Konishi <ryusuke@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ *
+ */
+/*
+ * This file adds the cache of on-disk blocks to be moved in garbage
+ * collection.  The disk blocks are held with dummy inodes (called
+ * gcinodes), and this file provides the lookup function of the dummy
+ * inodes and their buffer read function.
+ *
+ * Since NILFS2 keeps multiple checkpoints/snapshots across GC, it
+ * has to treat blocks that belong to the same file but have different
+ * checkpoint numbers.  To avoid interference among generations, dummy
+ * inodes are managed separately from actual inodes, and their lookup
+ * function (nilfs_gc_iget) is designed to be specified with a
+ * checkpoint number argument as well as an inode number.
+ *
+ * Buffers and pages held by the dummy inodes will be released each
+ * time after they are copied to a new log.  Dirty blocks made on the
+ * current generation and the blocks to be moved by GC never overlap
+ * because the dirty blocks make a new generation; they rather must be
+ * written individually.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include "nilfs.h"
+#include "page.h"
+#include "mdt.h"
+#include "dat.h"
+#include "ifile.h"
+
+static struct address_space_operations def_gcinode_aops = {};
+/* XXX need def_gcinode_iops/fops? */
+
+/*
+ * nilfs_gccache_submit_read_data() - add data buffer and submit read request
+ * @inode - gc inode
+ * @blkoff - dummy offset treated as the key for the page cache
+ * @pbn - physical block number of the block
+ * @vbn - virtual block number of the block, 0 for non-virtual block
+ * @out_bh - indirect pointer to a buffer_head struct to receive the results
+ *
+ * Description: nilfs_gccache_submit_read_data() registers the data buffer
+ * specified by @pbn to the GC pagecache with the key @blkoff.
+ * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The block specified with @pbn does not exist. + */ +int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, + sector_t pbn, __u64 vbn, + struct buffer_head **out_bh) +{ + struct buffer_head *bh; + int err; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + return -ENOMEM; + + if (buffer_uptodate(bh)) + goto out; + + if (pbn == 0) { + struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat; + /* use original dat, not gc dat. */ + err = nilfs_dat_translate(dat_inode, vbn, &pbn); + if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ + brelse(bh); + goto failed; + } + } + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + if (!buffer_mapped(bh)) { + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + set_buffer_mapped(bh); + } + bh->b_blocknr = pbn; + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + if (vbn) + bh->b_blocknr = vbn; + out: + err = 0; + *out_bh = bh; + + failed: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +/* + * nilfs_gccache_submit_read_node() - add node buffer and submit read request + * @inode - gc inode + * @pbn - physical block number for the block + * @vbn - virtual block number for the block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_node() registers the node buffer + * specified by @vbn to the GC pagecache. @pbn can be supplied by the + * caller to avoid translation of the disk block address. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, + __u64 vbn, struct buffer_head **out_bh) +{ + int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? : pbn, pbn, out_bh, 0); + if (ret == -EEXIST) /* internal code (cache hit) */ + ret = 0; + return ret; +} + +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) +{ + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + if (buffer_dirty(bh)) + return -EEXIST; + + if (buffer_nilfs_node(bh)) + nilfs_btnode_mark_dirty(bh); + else + nilfs_mdt_mark_buffer_dirty(bh); + return 0; +} + +/* + * nilfs_init_gccache() - allocate and initialize gc_inode hash table + * @nilfs - the_nilfs + * + * Return Value: On success, 0. + * On error, a negative error code is returned. 
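+ *
+ * As a hedged sketch of how the routines above cooperate during one GC
+ * pass (this mirrors what the clean-segments ioctl does; error handling
+ * omitted):
+ *
+ *	nilfs_gccache_submit_read_data(inode, blkoff, pbn, vbn, &bh);
+ *		(or nilfs_gccache_submit_read_node(inode, pbn, vbn, &bh);)
+ *	...
+ *	nilfs_gccache_wait_and_mark_dirty(bh);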
+ */ +int nilfs_init_gccache(struct the_nilfs *nilfs) +{ + int loop; + + BUG_ON(nilfs->ns_gc_inodes_h); + + INIT_LIST_HEAD(&nilfs->ns_gc_inodes); + + nilfs->ns_gc_inodes_h = + kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE, + GFP_NOFS); + if (nilfs->ns_gc_inodes_h == NULL) + return -ENOMEM; + + for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++) + INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]); + return 0; +} + +/* + * nilfs_destroy_gccache() - free gc_inode hash table + * @nilfs - the nilfs + */ +void nilfs_destroy_gccache(struct the_nilfs *nilfs) +{ + if (nilfs->ns_gc_inodes_h) { + nilfs_remove_all_gcinode(nilfs); + kfree(nilfs->ns_gc_inodes_h); + nilfs->ns_gc_inodes_h = NULL; + } +} + +static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino, + __u64 cno) +{ + struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS); + struct nilfs_inode_info *ii; + + if (!inode) + return NULL; + + inode->i_op = NULL; + inode->i_fop = NULL; + inode->i_mapping->a_ops = &def_gcinode_aops; + + ii = NILFS_I(inode); + ii->i_cno = cno; + ii->i_flags = 0; + ii->i_state = 1 << NILFS_I_GCINODE; + ii->i_bh = NULL; + nilfs_bmap_init_gc(ii->i_bmap); + + return inode; +} + +static unsigned long ihash(ino_t ino, __u64 cno) +{ + return hash_long((unsigned long)((ino << 2) + cno), + NILFS_GCINODE_HASH_BITS); +} + +/* + * nilfs_gc_iget() - find or create gc inode with specified (ino,cno) + */ +struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno) +{ + struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno); + struct hlist_node *node; + struct inode *inode; + + hlist_for_each_entry(inode, node, head, i_hash) { + if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno) + return inode; + } + + inode = alloc_gcinode(nilfs, ino, cno); + if (likely(inode)) { + hlist_add_head(&inode->i_hash, head); + list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes); + } + return inode; +} + +/* + * nilfs_clear_gcinode() - clear and free a gc inode + */ +void nilfs_clear_gcinode(struct inode *inode) +{ + nilfs_mdt_clear(inode); + nilfs_mdt_destroy(inode); +} + +/* + * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs + */ +void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) +{ + struct hlist_head *head = nilfs->ns_gc_inodes_h; + struct hlist_node *node, *n; + struct inode *inode; + int loop; + + for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { + hlist_for_each_entry_safe(inode, node, n, head, i_hash) { + hlist_del_init(&inode->i_hash); + list_del_init(&NILFS_I(inode)->i_dirty); + nilfs_clear_gcinode(inode); /* might sleep */ + } + } +} diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c new file mode 100644 index 000000000000..de86401f209f --- /dev/null +++ b/fs/nilfs2/ifile.c @@ -0,0 +1,150 @@ +/* + * ifile.c - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Written by Amagai Yoshiji <amagai@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "alloc.h"
+#include "ifile.h"
+
+/**
+ * nilfs_ifile_create_inode - create a new disk inode
+ * @ifile: ifile inode
+ * @out_ino: pointer to a variable to store the inode number
+ * @out_bh: buffer_head containing the newly allocated disk inode
+ *
+ * Return Value: On success, 0 is returned, the newly allocated inode
+ * number is stored at the location pointed to by @out_ino, and a pointer
+ * to the buffer_head that contains the newly allocated disk inode
+ * structure is stored at the location pointed to by @out_bh.
+ * On error, one of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOSPC - No inode left.
+ */
+int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
+			     struct buffer_head **out_bh)
+{
+	struct nilfs_palloc_req req;
+	int ret;
+
+	req.pr_entry_nr = 0;  /* 0 says find free inode from beginning of
+				 a group. dull code!! */
+	req.pr_entry_bh = NULL;
+
+	ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+	if (!ret) {
+		ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
+						   &req.pr_entry_bh);
+		if (ret < 0)
+			nilfs_palloc_abort_alloc_entry(ifile, &req);
+	}
+	if (ret < 0) {
+		brelse(req.pr_entry_bh);
+		return ret;
+	}
+	nilfs_palloc_commit_alloc_entry(ifile, &req);
+	nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+	nilfs_mdt_mark_dirty(ifile);
+	*out_ino = (ino_t)req.pr_entry_nr;
+	*out_bh = req.pr_entry_bh;
+	return 0;
+}
+
+/**
+ * nilfs_ifile_delete_inode - delete a disk inode
+ * @ifile: ifile inode
+ * @ino: inode number
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The inode number @ino has not been allocated.
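+ *
+ * Both the create and delete paths follow the same persistent-allocator
+ * idiom (a reading aid inferred from the two bodies in this file):
+ *
+ *	nilfs_palloc_prepare_{alloc,free}_entry(ifile, &req);
+ *	nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, ..., &req.pr_entry_bh);
+ *	nilfs_palloc_abort_{alloc,free}_entry(ifile, &req);   on failure
+ *	nilfs_palloc_commit_{alloc,free}_entry(ifile, &req);  on success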
+ */ +int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) +{ + struct nilfs_palloc_req req = { + .pr_entry_nr = ino, .pr_entry_bh = NULL + }; + struct nilfs_inode *raw_inode; + void *kaddr; + int ret; + + ret = nilfs_palloc_prepare_free_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_free_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + + kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0); + raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, + req.pr_entry_bh, kaddr); + raw_inode->i_flags = 0; + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + brelse(req.pr_entry_bh); + + nilfs_palloc_commit_free_entry(ifile, &req); + + return 0; +} + +int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, + struct buffer_head **out_bh) +{ + struct super_block *sb = ifile->i_sb; + int err; + + if (unlikely(!NILFS_VALID_INODE(sb, ino))) { + nilfs_error(sb, __func__, "bad inode number: %lu", + (unsigned long) ino); + return -EINVAL; + } + + err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); + if (unlikely(err)) { + if (err == -EINVAL) + nilfs_error(sb, __func__, "ifile is broken"); + else + nilfs_warning(sb, __func__, + "unable to read inode: %lu", + (unsigned long) ino); + } + return err; +} diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h new file mode 100644 index 000000000000..5d30a35679b5 --- /dev/null +++ b/fs/nilfs2/ifile.h @@ -0,0 +1,53 @@ +/* + * ifile.h - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net> + * Revised by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _NILFS_IFILE_H +#define _NILFS_IFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "alloc.h" + +#define NILFS_IFILE_GFP NILFS_MDT_GFP + +static inline struct nilfs_inode * +nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) +{ + void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); +} + +static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, + struct buffer_head *ibh) +{ + kunmap(ibh->b_page); +} + +int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); +int nilfs_ifile_delete_inode(struct inode *, ino_t); +int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); + +#endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c new file mode 100644 index 000000000000..49ab4a49bb4f --- /dev/null +++ b/fs/nilfs2/inode.c @@ -0,0 +1,785 @@ +/* + * inode.c - NILFS inode operations. 
+ * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/uio.h> +#include "nilfs.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" +#include "cpfile.h" +#include "ifile.h" + + +/** + * nilfs_get_block() - get a file block on the filesystem (callback function) + * @inode - inode struct of the target file + * @blkoff - file block number + * @bh_result - buffer head to be mapped on + * @create - indicate whether allocating the block or not when it has not + * been allocated yet. + * + * This function does not issue actual read request of the specified data + * block. It is done by VFS. + * Bulk read for direct-io is not supported yet. (should be supported) + */ +int nilfs_get_block(struct inode *inode, sector_t blkoff, + struct buffer_head *bh_result, int create) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + unsigned long blknum = 0; + int err = 0, ret; + struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); + + /* This exclusion control is a workaround; should be revised */ + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + if (ret == 0) { /* found */ + map_bh(bh_result, inode->i_sb, blknum); + goto out; + } + /* data block was not found */ + if (ret == -ENOENT && create) { + struct nilfs_transaction_info ti; + + bh_result->b_blocknr = 0; + err = nilfs_transaction_begin(inode->i_sb, &ti, 1); + if (unlikely(err)) + goto out; + err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, + (unsigned long)bh_result); + if (unlikely(err != 0)) { + if (err == -EEXIST) { + /* + * The get_block() function could be called + * from multiple callers for an inode. + * However, the page having this block must + * be locked in this case. + */ + printk(KERN_WARNING + "nilfs_get_block: a race condition " + "while inserting a data block. " + "(inode number=%lu, file block " + "offset=%llu)\n", + inode->i_ino, + (unsigned long long)blkoff); + err = 0; + } else if (err == -EINVAL) { + nilfs_error(inode->i_sb, __func__, + "broken bmap (inode=%lu)\n", + inode->i_ino); + err = -EIO; + } + nilfs_transaction_abort(inode->i_sb); + goto out; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + /* Error handling should be detailed */ + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed + to proper value */ + } else if (ret == -ENOENT) { + /* not found is not error (e.g. hole); must return without + the mapped state flag. 
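+		   As a reading aid, the contract this get_block callback
+		   implements (summarizing the branches above):
+
+			lookup hit             map_bh() and return 0
+			hole with create == 0  leave bh unmapped, return 0
+			hole with create != 0  insert via bmap, set_buffer_new()
+			broken bmap            return -EIO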
*/ + ; + } else { + err = ret; + } + + out: + return err; +} + +/** + * nilfs_readpage() - implement readpage() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @page - the page to be read + */ +static int nilfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, nilfs_get_block); +} + +/** + * nilfs_readpages() - implement readpages() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @mapping - address_space struct used for reading multiple pages + * @pages - the pages to be read + * @nr_pages - number of pages to be read + */ +static int nilfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); +} + +static int nilfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int err = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, + wbc->range_start, + wbc->range_end); + return err; +} + +static int nilfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int err; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (wbc->sync_mode == WB_SYNC_ALL) { + err = nilfs_construct_segment(inode->i_sb); + if (unlikely(err)) + return err; + } else if (wbc->for_reclaim) + nilfs_flush_segment(inode->i_sb, inode->i_ino); + + return 0; +} + +static int nilfs_set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty_buffers(page); + + if (ret) { + struct inode *inode = page->mapping->host; + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + + nilfs_set_file_dirty(sbi, inode, nr_dirty); + } + return ret; +} + +static int nilfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + +{ + struct inode *inode = mapping->host; + int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); + + if (unlikely(err)) + return err; + + *pagep = NULL; + err = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, nilfs_get_block); + if (unlikely(err)) + nilfs_transaction_abort(inode->i_sb); + return err; +} + +static int nilfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned nr_dirty; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, start, + start + copied); + copied = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); + err = nilfs_transaction_commit(inode->i_sb); + return err ? 
: copied; +} + +static ssize_t +nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t size; + + if (rw == WRITE) + return 0; + + /* Needs synchronization with the cleaner */ + size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, nilfs_get_block, NULL); + return size; +} + +struct address_space_operations nilfs_aops = { + .writepage = nilfs_writepage, + .readpage = nilfs_readpage, + /* .sync_page = nilfs_sync_page, */ + .writepages = nilfs_writepages, + .set_page_dirty = nilfs_set_page_dirty, + .readpages = nilfs_readpages, + .write_begin = nilfs_write_begin, + .write_end = nilfs_write_end, + /* .releasepage = nilfs_releasepage, */ + .invalidatepage = block_invalidatepage, + .direct_IO = nilfs_direct_IO, +}; + +struct inode *nilfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct inode *inode; + struct nilfs_inode_info *ii; + int err = -ENOMEM; + ino_t ino; + + inode = new_inode(sb); + if (unlikely(!inode)) + goto failed; + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + + ii = NILFS_I(inode); + ii->i_state = 1 << NILFS_I_NEW; + + err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + + atomic_inc(&sbi->s_inodes_count); + + inode->i_uid = current_fsuid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + + inode->i_mode = mode; + inode->i_ino = ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + err = nilfs_bmap_read(ii->i_bmap, NULL); + if (err < 0) + goto failed_bmap; + + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + + ii->i_flags = NILFS_I(dir)->i_flags; + if (S_ISLNK(mode)) + ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL); + if (!S_ISDIR(mode)) + ii->i_flags &= ~NILFS_DIRSYNC_FL; + + /* ii->i_file_acl = 0; */ + /* ii->i_dir_acl = 0; */ + ii->i_dir_start_lookup = 0; +#ifdef CONFIG_NILFS_FS_POSIX_ACL + ii->i_acl = NULL; + ii->i_default_acl = NULL; +#endif + ii->i_cno = 0; + nilfs_set_inode_flags(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + err = nilfs_init_acl(inode, dir); + if (unlikely(err)) + goto failed_acl; /* never occur. When supporting + nilfs_init_acl(), proper cancellation of + above jobs should be considered */ + + mark_inode_dirty(inode); + return inode; + + failed_acl: + failed_bmap: + inode->i_nlink = 0; + iput(inode); /* raw_inode will be deleted through + generic_delete_inode() */ + goto failed; + + failed_ifile_create_inode: + make_bad_inode(inode); + iput(inode); /* if i_nlink == 1, generic_forget_inode() will be + called */ + failed: + return ERR_PTR(err); +} + +void nilfs_free_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + clear_inode(inode); + /* XXX: check error code? Is there any thing I can do? 
*/ + (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino); + atomic_dec(&sbi->s_inodes_count); +} + +void nilfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = NILFS_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC); + if (flags & NILFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & NILFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & NILFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; +#ifndef NILFS_ATIME_DISABLE + if (flags & NILFS_NOATIME_FL) +#endif + inode->i_flags |= S_NOATIME; + if (flags & NILFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); +} + +int nilfs_read_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); + inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le64_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (inode->i_nlink == 0 && inode->i_mode == 0) + return -EINVAL; /* this inode is deleted */ + + inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); + ii->i_flags = le32_to_cpu(raw_inode->i_flags); +#if 0 + ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ii->i_dir_acl = S_ISREG(inode->i_mode) ? + 0 : le32_to_cpu(raw_inode->i_dir_acl); +#endif + ii->i_cno = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + err = nilfs_bmap_read(ii->i_bmap, raw_inode); + if (err < 0) + return err; + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. 
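+		   Note, as a reading aid: the on-disk inode carries no
+		   atime field of its own; i_atime is filled from the raw
+		   mtime above, so atime is effectively not persisted.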
*/
+	}
+	return 0;
+}
+
+static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
+			      struct inode *inode)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
+	struct buffer_head *bh;
+	struct nilfs_inode *raw_inode;
+	int err;
+
+	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
+	if (unlikely(err))
+		goto bad_inode;
+
+	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
+
+#ifdef CONFIG_NILFS_FS_POSIX_ACL
+	NILFS_I(inode)->i_acl = NILFS_ACL_NOT_CACHED;
+	NILFS_I(inode)->i_default_acl = NILFS_ACL_NOT_CACHED;
+#endif
+	if (nilfs_read_inode_common(inode, raw_inode))
+		goto failed_unmap;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &nilfs_file_inode_operations;
+		inode->i_fop = &nilfs_file_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &nilfs_dir_inode_operations;
+		inode->i_fop = &nilfs_dir_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &nilfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else {
+		inode->i_op = &nilfs_special_inode_operations;
+		init_special_inode(
+			inode, inode->i_mode,
+			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+	}
+	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	brelse(bh);
+	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	nilfs_set_inode_flags(inode);
+	return 0;
+
+ failed_unmap:
+	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	brelse(bh);
+
+ bad_inode:
+	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	return err;
+}
+
+struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = __nilfs_read_inode(sb, ino, inode);
+	if (unlikely(err)) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+	unlock_new_inode(inode);
+	return inode;
+}
+
+void nilfs_write_inode_common(struct inode *inode,
+			      struct nilfs_inode *raw_inode, int has_bmap)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+	raw_inode->i_uid = cpu_to_le32(inode->i_uid);
+	raw_inode->i_gid = cpu_to_le32(inode->i_gid);
+	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+	raw_inode->i_size = cpu_to_le64(inode->i_size);
+	raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
+
+	raw_inode->i_flags = cpu_to_le32(ii->i_flags);
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+
+	if (has_bmap)
+		nilfs_bmap_write(ii->i_bmap, raw_inode);
+	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		raw_inode->i_device_code =
+			cpu_to_le64(new_encode_dev(inode->i_rdev));
+	/* When extending the inode, nilfs->ns_inode_size should be checked
+	   for substitutions of appended fields */
+}
+
+void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
+{
+	ino_t ino = inode->i_ino;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_inode *raw_inode;
+
+	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
+
+	/* The buffer is guarded with lock_buffer() by
the caller */ + if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) + memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); + set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + + nilfs_write_inode_common(inode, raw_inode, 0); + /* XXX: call with has_bmap = 0 is a workaround to avoid + deadlock of bmap. This delays update of i_bmap to just + before writing */ + nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); +} + +#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ + +static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, + unsigned long from) +{ + unsigned long b; + int ret; + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + repeat: + ret = nilfs_bmap_last_key(ii->i_bmap, &b); + if (ret == -ENOENT) + return; + else if (ret < 0) + goto failed; + + if (b < from) + return; + + b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); + ret = nilfs_bmap_truncate(ii->i_bmap, b); + nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); + if (!ret || (ret == -ENOMEM && + nilfs_bmap_truncate(ii->i_bmap, b) == 0)) + goto repeat; + + failed: + if (ret == -EINVAL) + nilfs_error(ii->vfs_inode.i_sb, __func__, + "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); + else + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); +} + +void nilfs_truncate(struct inode *inode) +{ + unsigned long blkoff; + unsigned int blocksize; + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + blocksize = sb->s_blocksize; + blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); + + nilfs_truncate_bmap(ii, blkoff); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But truncate has no return value. */ +} + +void nilfs_delete_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (unlikely(is_bad_inode(inode))) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + return; + } + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + nilfs_truncate_bmap(ii, 0); + nilfs_free_inode(inode); + /* nilfs_free_inode() marks inode buffer dirty */ + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But delete_inode has no return value. 
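+	   For reference, the transaction idiom the operations in this file
+	   wrap around their updates (a sketch; the third argument to
+	   begin() varies per call site):
+
+		nilfs_transaction_begin(sb, &ti, 0);
+		(modify blocks, bmap, inode state)
+		nilfs_transaction_commit(sb);   on success
+		nilfs_transaction_abort(sb);    on failure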
*/ +} + +int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct nilfs_transaction_info ti; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + int err; + + err = inode_change_ok(inode, iattr); + if (err) + return err; + + err = nilfs_transaction_begin(sb, &ti, 0); + if (unlikely(err)) + return err; + err = inode_setattr(inode, iattr); + if (!err && (iattr->ia_valid & ATTR_MODE)) + err = nilfs_acl_chmod(inode); + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + + return err; +} + +int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, + struct buffer_head **pbh) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + spin_lock(&sbi->s_inode_lock); + /* Caller of this function MUST lock s_inode_lock */ + if (ii->i_bh == NULL) { + spin_unlock(&sbi->s_inode_lock); + err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, + pbh); + if (unlikely(err)) + return err; + spin_lock(&sbi->s_inode_lock); + if (ii->i_bh == NULL) + ii->i_bh = *pbh; + else { + brelse(*pbh); + *pbh = ii->i_bh; + } + } else + *pbh = ii->i_bh; + + get_bh(*pbh); + spin_unlock(&sbi->s_inode_lock); + return 0; +} + +int nilfs_inode_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + int ret = 0; + + if (!list_empty(&ii->i_dirty)) { + spin_lock(&sbi->s_inode_lock); + ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || + test_bit(NILFS_I_BUSY, &ii->i_state); + spin_unlock(&sbi->s_inode_lock); + } + return ret; +} + +int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, + unsigned nr_dirty) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); + + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) + return 0; + + spin_lock(&sbi->s_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + /* Because this routine may race with nilfs_dispose_list(), + we have to check NILFS_I_QUEUED here, too. */ + if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { + /* This will happen when somebody is freeing + this inode. */ + nilfs_warning(sbi->s_super, __func__, + "cannot get inode (ino=%lu)\n", + inode->i_ino); + spin_unlock(&sbi->s_inode_lock); + return -EINVAL; /* NILFS_I_DIRTY may remain for + freeing inode */ + } + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &sbi->s_dirty_files); + set_bit(NILFS_I_QUEUED, &ii->i_state); + } + spin_unlock(&sbi->s_inode_lock); + return 0; +} + +int nilfs_mark_inode_dirty(struct inode *inode) +{ + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + struct buffer_head *ibh; + int err; + + err = nilfs_load_inode_block(sbi, inode, &ibh); + if (unlikely(err)) { + nilfs_warning(inode->i_sb, __func__, + "failed to reget inode block.\n"); + return err; + } + lock_buffer(ibh); + nilfs_update_inode(inode, ibh); + unlock_buffer(ibh); + nilfs_mdt_mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(sbi->s_ifile); + brelse(ibh); + return 0; +} + +/** + * nilfs_dirty_inode - reflect changes on given inode to an inode block. + * @inode: inode of the file to be registered. + * + * nilfs_dirty_inode() loads a inode block containing the specified + * @inode and copies data from a nilfs_inode to a corresponding inode + * entry in the inode block. This operation is excluded from the segment + * construction. 
This function can be called both as a single operation + * and as a part of indivisible file operations. + */ +void nilfs_dirty_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + + if (is_bad_inode(inode)) { + nilfs_warning(inode->i_sb, __func__, + "tried to mark bad_inode dirty. ignored.\n"); + dump_stack(); + return; + } + nilfs_transaction_begin(inode->i_sb, &ti, 0); + nilfs_mark_inode_dirty(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ +} diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c new file mode 100644 index 000000000000..50ff3f2cdf24 --- /dev/null +++ b/fs/nilfs2/ioctl.c @@ -0,0 +1,665 @@ +/* + * ioctl.c - NILFS ioctl operations. + * + * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/fs.h> +#include <linux/wait.h> +#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */ +#include <linux/capability.h> /* capable() */ +#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ +#include <linux/vmalloc.h> +#include <linux/nilfs2_fs.h> +#include "nilfs.h" +#include "segment.h" +#include "bmap.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" + + +static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, + struct nilfs_argv *argv, int dir, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) +{ + void *buf; + void __user *base = (void __user *)(unsigned long)argv->v_base; + size_t maxmembs, total, n; + ssize_t nr; + int ret, i; + __u64 pos, ppos; + + if (argv->v_nmembs == 0) + return 0; + + if (argv->v_size > PAGE_SIZE) + return -EINVAL; + + buf = (void *)__get_free_pages(GFP_NOFS, 0); + if (unlikely(!buf)) + return -ENOMEM; + maxmembs = PAGE_SIZE / argv->v_size; + + ret = 0; + total = 0; + pos = argv->v_index; + for (i = 0; i < argv->v_nmembs; i += n) { + n = (argv->v_nmembs - i < maxmembs) ? 
+ argv->v_nmembs - i : maxmembs; + if ((dir & _IOC_WRITE) && + copy_from_user(buf, base + argv->v_size * i, + argv->v_size * n)) { + ret = -EFAULT; + break; + } + ppos = pos; + nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size, + n); + if (nr < 0) { + ret = nr; + break; + } + if ((dir & _IOC_READ) && + copy_to_user(base + argv->v_size * i, buf, + argv->v_size * nr)) { + ret = -EFAULT; + break; + } + total += nr; + if ((size_t)nr < n) + break; + if (pos == ppos) + pos += n; + } + argv->v_nmembs = total; + + free_pages((unsigned long)buf, 0); + return ret; +} + +static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct nilfs_transaction_info ti; + struct nilfs_cpmode cpmode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&cpmode, argp, sizeof(cpmode))) + return -EFAULT; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_change_cpmode( + cpfile, cpmode.cm_cno, cpmode.cm_mode); + if (unlikely(ret < 0)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + return ret; +} + +static int +nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct nilfs_transaction_info ti; + __u64 cno; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&cno, argp, sizeof(cno))) + return -EFAULT; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); + if (unlikely(ret < 0)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf, + nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_cpstat cpstat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &cpstat, sizeof(cpstat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_sustat sustat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &sustat, sizeof(sustat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, 
size_t size, size_t nmembs) +{ + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs); + up_read(&nilfs->ns_segctor_sem); + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + down_read(&nilfs->ns_segctor_sem); + for (i = 0; i < nmembs; i++) { + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) { + up_read(&nilfs->ns_segctor_sem); + return ret; + } + bdescs[i].bd_blocknr = 0; + } + } + up_read(&nilfs->ns_segctor_sem); + return nmembs; +} + +static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + if (argv.v_size != sizeof(struct nilfs_bdesc)) + return -EINVAL; + + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_bdescs); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_move_inode_block(struct inode *inode, + struct nilfs_vdesc *vdesc, + struct list_head *buffers) +{ + struct buffer_head *bh; + int ret; + + if (vdesc->vd_flags == 0) + ret = nilfs_gccache_submit_read_data( + inode, vdesc->vd_offset, vdesc->vd_blocknr, + vdesc->vd_vblocknr, &bh); + else + ret = nilfs_gccache_submit_read_node( + inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh); + + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + printk(KERN_CRIT + "%s: invalid virtual block address (%s): " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + return ret; + } + bh->b_private = vdesc; + list_add_tail(&bh->b_assoc_buffers, buffers); + return 0; +} + +static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct inode *inode; + struct nilfs_vdesc *vdesc; + struct buffer_head *bh, *n; + LIST_HEAD(buffers); + ino_t ino; + __u64 cno; + int i, ret; + + for (i = 0, vdesc = buf; i < nmembs; ) { + ino = vdesc->vd_ino; + cno = vdesc->vd_cno; + inode = nilfs_gc_iget(nilfs, ino, cno); + if (unlikely(inode == NULL)) { + ret = -ENOMEM; + goto failed; + } + do { + ret = nilfs_ioctl_move_inode_block(inode, vdesc, + &buffers); + if (unlikely(ret < 0)) + goto failed; + vdesc++; + } while (++i < nmembs && + vdesc->vd_ino == ino && vdesc->vd_cno == cno); + } + + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + ret = nilfs_gccache_wait_and_mark_dirty(bh); + if (unlikely(ret < 0)) { + if (ret == -EEXIST) { + vdesc = bh->b_private; + printk(KERN_CRIT + "%s: conflicting %s buffer: " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, + vdesc->vd_flags ? 
"node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + } + goto failed; + } + list_del_init(&bh->b_assoc_buffers); + bh->b_private = NULL; + brelse(bh); + } + return nmembs; + + failed: + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + bh->b_private = NULL; + brelse(bh); + } + return ret; +} + +static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct inode *cpfile = nilfs->ns_cpfile; + struct nilfs_period *periods = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_cpfile_delete_checkpoints( + cpfile, periods[i].p_start, periods[i].p_end); + if (ret < 0) + return ret; + } + return nmembs; +} + +static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + int ret; + + ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); + + return (ret < 0) ? ret : nmembs; +} + +static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct inode *dat = nilfs_dat_inode(nilfs); + struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + /* XXX: use macro or inline func to check liveness */ + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr) + /* skip dead block */ + continue; + if (bdescs[i].bd_level == 0) { + ret = nilfs_mdt_mark_block_dirty(dat, + bdescs[i].bd_offset); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } else { + ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, + bdescs[i].bd_level); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } + } + return nmembs; +} + +static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void *buf) +{ + size_t nmembs = argv->v_nmembs; + struct nilfs_sb_info *sbi = nilfs->ns_writer; + int ret; + + if (unlikely(!sbi)) { + /* never happens because called for a writable mount */ + WARN_ON(1); + return -EROFS; + } + ret = nilfs_segctor_add_segments_to_be_freed( + NILFS_SC(sbi), buf, nmembs); + + return (ret < 0) ? ret : nmembs; +} + +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, + struct nilfs_argv *argv, void **kbufs) +{ + const char *msg; + int ret; + + ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); + if (ret < 0) { + msg = "cannot read source blocks"; + goto failed; + } + + ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); + if (ret < 0) { + /* + * can safely abort because checkpoints can be removed + * independently. + */ + msg = "cannot delete checkpoints"; + goto failed; + } + ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], kbufs[2]); + if (ret < 0) { + /* + * can safely abort because DAT file is updated atomically + * using a copy-on-write technique. + */ + msg = "cannot delete virtual blocks from DAT file"; + goto failed; + } + ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], kbufs[3]); + if (ret < 0) { + /* + * can safely abort because the operation is nondestructive. 
+ */ + msg = "cannot mark copying blocks dirty"; + goto failed; + } + ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]); + if (ret < 0) { + /* + * can safely abort because this operation is atomic. + */ + msg = "cannot set segments to be freed"; + goto failed; + } + return 0; + + failed: + nilfs_remove_all_gcinode(nilfs); + printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", + msg, ret); + return ret; +} + +static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct nilfs_argv argv[5]; + const static size_t argsz[5] = { + sizeof(struct nilfs_vdesc), + sizeof(struct nilfs_period), + sizeof(__u64), + sizeof(struct nilfs_bdesc), + sizeof(__u64), + }; + void __user *base; + void *kbufs[5]; + struct the_nilfs *nilfs; + size_t len, nsegs; + int n, ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(argv, argp, sizeof(argv))) + return -EFAULT; + + nsegs = argv[4].v_nmembs; + if (argv[4].v_size != argsz[4]) + return -EINVAL; + /* + * argv[4] points to segment numbers this ioctl cleans. We + * use kmalloc() for its buffer because memory used for the + * segment numbers is enough small. + */ + kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, + nsegs * sizeof(__u64)); + if (IS_ERR(kbufs[4])) + return PTR_ERR(kbufs[4]); + + nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + + for (n = 0; n < 4; n++) { + ret = -EINVAL; + if (argv[n].v_size != argsz[n]) + goto out_free; + + if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment) + goto out_free; + + len = argv[n].v_size * argv[n].v_nmembs; + base = (void __user *)(unsigned long)argv[n].v_base; + if (len == 0) { + kbufs[n] = NULL; + continue; + } + + kbufs[n] = vmalloc(len); + if (!kbufs[n]) { + ret = -ENOMEM; + goto out_free; + } + if (copy_from_user(kbufs[n], base, len)) { + ret = -EFAULT; + vfree(kbufs[n]); + goto out_free; + } + } + + ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + + out_free: + while (--n > 0) + vfree(kbufs[n]); + kfree(kbufs[4]); + return ret; +} + +static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + __u64 cno; + int ret; + + ret = nilfs_construct_segment(inode->i_sb); + if (ret < 0) + return ret; + + if (argp != NULL) { + cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1; + if (copy_to_user(argp, &cno, sizeof(cno))) + return -EFAULT; + } + return 0; +} + +static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp, + size_t membsz, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) + +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + if (argv.v_size != membsz) + return -EINVAL; + + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + void __user *argp = (void * __user *)arg; + + switch (cmd) { + case NILFS_IOCTL_CHANGE_CPMODE: + return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); + case NILFS_IOCTL_DELETE_CHECKPOINT: + return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + 
sizeof(struct nilfs_cpinfo), + nilfs_ioctl_do_get_cpinfo); + case NILFS_IOCTL_GET_CPSTAT: + return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_suinfo), + nilfs_ioctl_do_get_suinfo); + case NILFS_IOCTL_GET_SUSTAT: + return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_VINFO: + return nilfs_ioctl_get_info(inode, filp, cmd, argp, + sizeof(struct nilfs_vinfo), + nilfs_ioctl_do_get_vinfo); + case NILFS_IOCTL_GET_BDESCS: + return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); + case NILFS_IOCTL_CLEAN_SEGMENTS: + return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); + case NILFS_IOCTL_SYNC: + return nilfs_ioctl_sync(inode, filp, cmd, argp); + default: + return -ENOTTY; + } +} diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c new file mode 100644 index 000000000000..bb78745a0e30 --- /dev/null +++ b/fs/nilfs2/mdt.c @@ -0,0 +1,564 @@ +/* + * mdt.c - meta data file for NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/mm.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/swap.h> +#include "nilfs.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) + +#define INIT_UNUSED_INODE_FIELDS + +static int +nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, + struct buffer_head *bh, + void (*init_block)(struct inode *, + struct buffer_head *, void *)) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + void *kaddr; + int ret; + + /* Caller exclude read accesses using page lock */ + + /* set_buffer_new(bh); */ + bh->b_blocknr = 0; + + ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); + if (unlikely(ret)) + return ret; + + set_buffer_mapped(bh); + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + if (init_block) + init_block(inode, bh, kaddr); + flush_dcache_page(bh->b_page); + kunmap_atomic(kaddr, KM_USER0); + + set_buffer_uptodate(bh); + nilfs_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + return 0; +} + +static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh, + void (*init_block)(struct inode *, + struct buffer_head *, + void *)) +{ + struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; + struct super_block *sb = inode->i_sb; + struct nilfs_transaction_info ti; + struct buffer_head *bh; + int err; + + if (!sb) { + /* + * Make sure this function is not called from any + * read-only context. 
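+		 *
+		 * For illustration, a hypothetical init_block callback as
+		 * accepted by nilfs_mdt_create_block()/nilfs_mdt_get_block()
+		 * (the helper name and body are assumptions):
+		 *
+		 *	static void my_init_block(struct inode *inode,
+		 *				  struct buffer_head *bh,
+		 *				  void *kaddr)
+		 *	{
+		 *		(write the initial on-disk layout at
+		 *		 kaddr + bh_offset(bh); the block arrives
+		 *		 already zeroed, see insert_new_block above)
+		 *	}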
+ */ + if (!nilfs->ns_writer) { + WARN_ON(1); + err = -EROFS; + goto out; + } + sb = nilfs->ns_writer->s_super; + } + + nilfs_transaction_begin(sb, &ti, 0); + + err = -ENOMEM; + bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); + if (unlikely(!bh)) + goto failed_unlock; + + err = -EEXIST; + if (buffer_uptodate(bh) || buffer_mapped(bh)) + goto failed_bh; +#if 0 + /* The uptodate flag is not protected by the page lock, but + the mapped flag is. Thus, we don't have to wait the buffer. */ + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + goto failed_bh; +#endif + + bh->b_bdev = nilfs->ns_bdev; + err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); + if (likely(!err)) { + get_bh(bh); + *out_bh = bh; + } + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + + failed_unlock: + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + out: + return err; +} + +static int +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, + int mode, struct buffer_head **out_bh) +{ + struct buffer_head *bh; + unsigned long blknum = 0; + int ret = -ENOMEM; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + goto failed; + + ret = -EEXIST; /* internal code */ + if (buffer_uptodate(bh)) + goto out; + + if (mode == READA) { + if (!trylock_buffer(bh)) { + ret = -EBUSY; + goto failed_bh; + } + } else /* mode == READ */ + lock_buffer(bh); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + if (!buffer_mapped(bh)) { /* unused buffer */ + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, + &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; + } + bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; + bh->b_blocknr = blknum; + set_buffer_mapped(bh); + } + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + ret = 0; + out: + get_bh(bh); + *out_bh = bh; + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + failed: + return ret; +} + +static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh) +{ + struct buffer_head *first_bh, *bh; + unsigned long blkoff; + int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; + int err; + + err = nilfs_mdt_submit_block(inode, block, READ, &first_bh); + if (err == -EEXIST) /* internal code */ + goto out; + + if (unlikely(err)) + goto failed; + + blkoff = block + 1; + for (i = 0; i < nr_ra_blocks; i++, blkoff++) { + err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + if (likely(!err || err == -EEXIST)) + brelse(bh); + else if (err != -EBUSY) + break; /* abort readahead if bmap lookup failed */ + + if (!buffer_locked(first_bh)) + goto out_no_wait; + } + + wait_on_buffer(first_bh); + + out_no_wait: + err = -EIO; + if (!buffer_uptodate(first_bh)) + goto failed_bh; + out: + *out_bh = first_bh; + return 0; + + failed_bh: + brelse(first_bh); + failed: + return err; +} + +/** + * nilfs_mdt_get_block - read or create a buffer on meta data file. + * @inode: inode of the meta data file + * @blkoff: block offset + * @create: create flag + * @init_block: initializer used for newly allocated block + * @out_bh: output of a pointer to the buffer_head + * + * nilfs_mdt_get_block() looks up the specified buffer and tries to create + * a new buffer if @create is not zero. On success, the returned buffer is + * assured to be either existing or formatted using a buffer lock on success. 
+ * @out_bh is substituted only when zero is returned. + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + * + * %-EROFS - Read only filesystem (for create mode) + */ +int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **out_bh) +{ + int ret; + + /* Should be rewritten with merging nilfs_mdt_read_block() */ + retry: + ret = nilfs_mdt_read_block(inode, blkoff, out_bh); + if (!create || ret != -ENOENT) + return ret; + + ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block); + if (unlikely(ret == -EEXIST)) { + /* create = 0; */ /* limit read-create loop retries */ + goto retry; + } + return ret; +} + +/** + * nilfs_mdt_delete_block - make a hole on the meta data file. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, zero is returned. + * On error, one of the following negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + */ +int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + err = nilfs_bmap_delete(ii->i_bmap, block); + if (!err || err == -ENOENT) { + nilfs_mdt_mark_dirty(inode); + nilfs_mdt_forget_block(inode, block); + } + return err; +} + +/** + * nilfs_mdt_forget_block - discard dirty state and try to remove the page + * @inode: inode of the meta data file + * @block: block offset + * + * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and + * tries to release the page including the buffer from a page cache. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EBUSY - page has an active buffer. + * + * %-ENOENT - page cache has no page addressed by the offset. + */ +int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) +{ + pgoff_t index = (pgoff_t)block >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + struct page *page; + unsigned long first_block; + int ret = 0; + int still_dirty; + + page = find_lock_page(inode->i_mapping, index); + if (!page) + return -ENOENT; + + wait_on_page_writeback(page); + + first_block = (unsigned long)index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (page_has_buffers(page)) { + struct buffer_head *bh; + + bh = nilfs_page_get_nth_block(page, block - first_block); + nilfs_forget_buffer(bh); + } + still_dirty = PageDirty(page); + unlock_page(page); + page_cache_release(page); + + if (still_dirty || + invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) + ret = -EBUSY; + return ret; +} + +/** + * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EINVAL - bmap is broken. 
(the caller should call nilfs_error()) + */ +int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) +{ + struct buffer_head *bh; + int err; + + err = nilfs_mdt_read_block(inode, block, &bh); + if (unlikely(err)) + return err; + nilfs_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + brelse(bh); + return 0; +} + +int nilfs_mdt_fetch_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { + set_bit(NILFS_I_DIRTY, &ii->i_state); + return 1; + } + return test_bit(NILFS_I_DIRTY, &ii->i_state); +} + +static int +nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = container_of(page->mapping, + struct inode, i_data); + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *writer = NULL; + int err = 0; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (page->mapping->assoc_mapping) + return 0; /* Do not request flush for shadow page cache */ + if (!sb) { + writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); + if (!writer) + return -EROFS; + sb = writer->s_super; + } + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_segment(sb); + else if (wbc->for_reclaim) + nilfs_flush_segment(sb, inode->i_ino); + + if (writer) + nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + return err; +} + + +static struct address_space_operations def_mdt_aops = { + .writepage = nilfs_mdt_write_page, +}; + +static struct inode_operations def_mdt_iops; +static struct file_operations def_mdt_fops; + +/* + * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, + * ifile, or gcinodes. This allows the B-tree code and segment constructor + * to treat them like regular files, and this helps to simplify the + * implementation. + * On the other hand, some of the pseudo inodes have an irregular point: + * They don't have valid inode->i_sb pointer because their lifetimes are + * longer than those of the super block structs; they may continue for + * several consecutive mounts/umounts. This would need discussions. 
+ */ +struct inode * +nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, + ino_t ino, gfp_t gfp_mask) +{ + struct inode *inode = nilfs_alloc_inode(sb); + + if (!inode) + return NULL; + else { + struct address_space * const mapping = &inode->i_data; + struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS); + + if (!mi) { + nilfs_destroy_inode(inode); + return NULL; + } + mi->mi_nilfs = nilfs; + init_rwsem(&mi->mi_sem); + + inode->i_sb = sb; /* sb may be NULL for some meta data files */ + inode->i_blkbits = nilfs->ns_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_nlink = 1; + inode->i_ino = ino; + inode->i_mode = S_IFREG; + inode->i_private = mi; + +#ifdef INIT_UNUSED_INODE_FIELDS + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; +#ifdef CONFIG_QUOTA + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); +#endif + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; +#ifdef CONFIG_SECURITY + inode->i_security = NULL; +#endif + inode->dirtied_when = 0; + + INIT_LIST_HEAD(&inode->i_list); + INIT_LIST_HEAD(&inode->i_sb_list); + inode->i_state = 0; +#endif + + spin_lock_init(&inode->i_lock); + mutex_init(&inode->i_mutex); + init_rwsem(&inode->i_alloc_sem); + + mapping->host = NULL; /* instead of inode */ + mapping->flags = 0; + mapping_set_gfp_mask(mapping, gfp_mask); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = nilfs->ns_bdi; + + inode->i_mapping = mapping; + } + + return inode; +} + +struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, + ino_t ino, gfp_t gfp_mask) +{ + struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); + + if (!inode) + return NULL; + + inode->i_op = &def_mdt_iops; + inode->i_fop = &def_mdt_fops; + inode->i_mapping->a_ops = &def_mdt_aops; + return inode; +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, + unsigned header_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_entry_size = entry_size; + mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); +} + +void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) +{ + shadow->i_mapping->assoc_mapping = orig->i_mapping; + NILFS_I(shadow)->i_btnode_cache.assoc_mapping = + &NILFS_I(orig)->i_btnode_cache; +} + +void nilfs_mdt_clear(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + invalidate_mapping_pages(inode->i_mapping, 0, -1); + truncate_inode_pages(inode->i_mapping, 0); + + nilfs_bmap_clear(ii->i_bmap); + nilfs_btnode_cache_clear(&ii->i_btnode_cache); +} + +void nilfs_mdt_destroy(struct inode *inode) +{ + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); + nilfs_destroy_inode(inode); +} diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h new file mode 100644 index 000000000000..df683e0bca6a --- /dev/null +++ b/fs/nilfs2/mdt.h @@ -0,0 +1,125 @@ +/* + * mdt.h - NILFS meta data file prototype and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_MDT_H +#define _NILFS_MDT_H + +#include <linux/buffer_head.h> +#include <linux/blockgroup_lock.h> +#include "nilfs.h" +#include "page.h" + +/** + * struct nilfs_mdt_info - on-memory private data of meta data files + * @mi_nilfs: back pointer to the_nilfs struct + * @mi_sem: reader/writer semaphore for meta data operations + * @mi_bgl: per-blockgroup locking + * @mi_entry_size: size of an entry + * @mi_first_entry_offset: offset to the first entry + * @mi_entries_per_block: number of entries in a block + * @mi_blocks_per_group: number of blocks in a group + * @mi_blocks_per_desc_block: number of blocks per descriptor block + */ +struct nilfs_mdt_info { + struct the_nilfs *mi_nilfs; + struct rw_semaphore mi_sem; + struct blockgroup_lock *mi_bgl; + unsigned mi_entry_size; + unsigned mi_first_entry_offset; + unsigned long mi_entries_per_block; + unsigned long mi_blocks_per_group; + unsigned long mi_blocks_per_desc_block; +}; + +static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) +{ + return inode->i_private; +} + +static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs; +} + +/* Default GFP flags using highmem */ +#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) + +int nilfs_mdt_get_block(struct inode *, unsigned long, int, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **); +int nilfs_mdt_delete_block(struct inode *, unsigned long); +int nilfs_mdt_forget_block(struct inode *, unsigned long); +int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); +int nilfs_mdt_fetch_dirty(struct inode *); + +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, + gfp_t); +struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, + ino_t, gfp_t); +void nilfs_mdt_destroy(struct inode *); +void nilfs_mdt_clear(struct inode *); +void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); +void nilfs_mdt_set_shadow(struct inode *, struct inode *); + + +#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) + +static inline void nilfs_mdt_mark_dirty(struct inode *inode) +{ + if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) + set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline void nilfs_mdt_clear_dirty(struct inode *inode) +{ + clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline __u64 nilfs_mdt_cno(struct inode *inode) +{ + return NILFS_MDT(inode)->mi_nilfs->ns_cno; +} + +#define nilfs_mdt_bgl_lock(inode, bg) \ + (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) + + +static inline int +nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh, + unsigned n) +{ + return nilfs_read_inode_common( + inode, (struct nilfs_inode *)(bh->b_data + n)); +} + +static inline void +nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh, + 
unsigned n) +{ + nilfs_write_inode_common( + inode, (struct nilfs_inode *)(bh->b_data + n), 1); +} + +#endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c new file mode 100644 index 000000000000..df70dadb336f --- /dev/null +++ b/fs/nilfs2/namei.c @@ -0,0 +1,474 @@ +/* + * namei.c - NILFS pathname lookup operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>, + * Ryusuke Konishi <ryusuke@osrg.net> + */ +/* + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/pagemap.h> +#include "nilfs.h" + + +static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. + */ + +static struct dentry * +nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > NILFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = nilfs_inode_by_name(dir, dentry); + inode = NULL; + if (ino) { + inode = nilfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +struct dentry *nilfs_get_parent(struct dentry *child) +{ + unsigned long ino; + struct inode *inode; + struct dentry dotdot; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + + ino = nilfs_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + + inode = nilfs_iget(child->d_inode->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_obtain_alias(inode); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int +nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = dir->i_sb; + unsigned l = strlen(symname)+1; + struct inode *inode; + int err; + + if (l > sb->s_blocksize) + return -ENAMETOOLONG; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + /* slow symlink */ + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + /* mark_inode_dirty(inode); */ + /* nilfs_new_inode() and page_symlink() do this */ + + err = nilfs_add_nondir(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int nilfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + if (inode->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + atomic_inc(&inode->i_count); + + err = nilfs_add_nondir(dentry, inode); + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (dir->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode_inc_link_count(dir); + + inode = nilfs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + + inode_inc_link_count(inode); + + err = nilfs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = 
nilfs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct nilfs_dir_entry *de; + struct page *page; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOENT; + de = nilfs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + inode = dentry->d_inode; + err = -EIO; + if (le64_to_cpu(de->inode) != inode->i_ino) + goto out; + + if (!inode->i_nlink) { + nilfs_warning(inode->i_sb, __func__, + "deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + err = nilfs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOTEMPTY; + if (nilfs_empty_dir(inode)) { + err = nilfs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct nilfs_dir_entry *dir_de = NULL; + struct page *old_page; + struct nilfs_dir_entry *old_de; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); + if (unlikely(err)) + return err; + + err = -ENOENT; + old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = nilfs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct nilfs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !nilfs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + inode_inc_link_count(old_inode); + nilfs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= NILFS_LINK_MAX) + goto out_dir; + } + inode_inc_link_count(old_inode); + err = nilfs_add_link(new_dentry, old_inode); + if (err) { + inode_dec_link_count(old_inode); + goto out_dir; + } + if (dir_de) + inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + * inode_dec_link_count() will mark the inode dirty. 
+ */ + old_inode->i_ctime = CURRENT_TIME; + + nilfs_delete_entry(old_de, old_page); + inode_dec_link_count(old_inode); + + if (dir_de) { + nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + + err = nilfs_transaction_commit(old_dir->i_sb); + return err; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + nilfs_transaction_abort(old_dir->i_sb); + return err; +} + +struct inode_operations nilfs_dir_inode_operations = { + .create = nilfs_create, + .lookup = nilfs_lookup, + .link = nilfs_link, + .unlink = nilfs_unlink, + .symlink = nilfs_symlink, + .mkdir = nilfs_mkdir, + .rmdir = nilfs_rmdir, + .mknod = nilfs_mknod, + .rename = nilfs_rename, + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +struct inode_operations nilfs_special_inode_operations = { + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +struct inode_operations nilfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h new file mode 100644 index 000000000000..da6fc0bba2e5 --- /dev/null +++ b/fs/nilfs2/nilfs.h @@ -0,0 +1,314 @@ +/* + * nilfs.h - NILFS local header file. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net> + * Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_H +#define _NILFS_H + +#include <linux/kernel.h> +#include <linux/buffer_head.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/nilfs2_fs.h> +#include "the_nilfs.h" +#include "sb.h" +#include "bmap.h" +#include "bmap_union.h" + +/* + * nilfs inode data in memory + */ +struct nilfs_inode_info { + __u32 i_flags; + unsigned long i_state; /* Dynamic state flags */ + struct nilfs_bmap *i_bmap; + union nilfs_bmap_union i_bmap_union; + __u64 i_xattr; /* sector_t ??? */ + __u32 i_dir_start_lookup; + __u64 i_cno; /* check point number for GC inode */ + struct address_space i_btnode_cache; + struct list_head i_dirty; /* List for connecting dirty files */ + +#ifdef CONFIG_NILFS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. 
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
+#ifdef CONFIG_NILFS_POSIX_ACL
+	struct posix_acl *i_acl;
+	struct posix_acl *i_default_acl;
+#endif
+	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
+					   disk inode */
+	struct inode vfs_inode;
+};
+
+static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
+{
+	return container_of(inode, struct nilfs_inode_info, vfs_inode);
+}
+
+static inline struct nilfs_inode_info *
+NILFS_BMAP_I(const struct nilfs_bmap *bmap)
+{
+	return container_of((union nilfs_bmap_union *)bmap,
+			    struct nilfs_inode_info,
+			    i_bmap_union);
+}
+
+static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
+{
+	struct nilfs_inode_info *ii =
+		container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
+	return &ii->vfs_inode;
+}
+
+static inline struct inode *NILFS_AS_I(struct address_space *mapping)
+{
+	return (mapping->host) ? :
+		container_of(mapping, struct inode, i_data);
+}
+
+/*
+ * Dynamic state flags of NILFS on-memory inode (i_state)
+ */
+enum {
+	NILFS_I_NEW = 0,		/* Inode is newly created */
+	NILFS_I_DIRTY,			/* The file is dirty */
+	NILFS_I_QUEUED,			/* inode is in dirty_files list */
+	NILFS_I_BUSY,			/* inode is grabbed by a segment
+					   constructor */
+	NILFS_I_COLLECTED,		/* All dirty blocks are collected */
+	NILFS_I_UPDATED,		/* The file has been written back */
+	NILFS_I_INODE_DIRTY,		/* write_inode is requested */
+	NILFS_I_BMAP,			/* has bmap and btnode_cache */
+	NILFS_I_GCINODE,		/* inode for GC, on memory only */
+	NILFS_I_GCDAT,			/* shadow DAT, on memory only */
+};
+
+/*
+ * Macros to check inode numbers
+ */
+#define NILFS_MDT_INO_BITS   \
+	((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO |	\
+			1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO |	\
+			1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
+
+#define NILFS_SYS_INO_BITS   \
+	((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
+
+#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino)
+
+#define NILFS_MDT_INODE(sb, ino) \
+	((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
+#define NILFS_VALID_INODE(sb, ino) \
+	((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
+
+/**
+ * struct nilfs_transaction_info: context information for synchronization
+ * @ti_magic: Magic number
+ * @ti_save: Backup of journal_info field of task_struct
+ * @ti_flags: Flags
+ * @ti_count: Nest level
+ * @ti_garbage: List of inodes to be put when releasing the semaphore
+ */
+struct nilfs_transaction_info {
+	u32			ti_magic;
+	void		       *ti_save;
+				/* This should never be used.  If this happens,
+				   one of the other filesystems has a bug. */
+	unsigned short		ti_flags;
+	unsigned short		ti_count;
+	struct list_head	ti_garbage;
+};
+
+/* ti_magic */
+#define NILFS_TI_MAGIC		0xd9e392fb
+
+/* ti_flags */
+#define NILFS_TI_DYNAMIC_ALLOC	0x0001  /* Allocated from slab */
+#define NILFS_TI_SYNC		0x0002	/* Force to construct segment at the
+					   end of transaction. */
+#define NILFS_TI_GC		0x0004	/* GC context */
+#define NILFS_TI_COMMIT		0x0008	/* Change happened or not */
+#define NILFS_TI_WRITER		0x0010	/* Constructor context */
+
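Every directory operation in namei.c earlier in this patch brackets its work between the three calls declared just below: begin, do the update, then exactly one of commit or abort on every path. A hedged sketch of that calling convention (nilfs_do_something() is a hypothetical payload, not a function from the patch):

	/* illustration of the transaction bracket used throughout namei.c */
	extern int nilfs_do_something(struct inode *dir);	/* hypothetical */

	static int nilfs_example_op(struct inode *dir)
	{
		struct nilfs_transaction_info ti;
		int err;

		/* third argument 1 asks begin() to check free space first */
		err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
		if (err)
			return err;	/* nothing started, nothing to undo */

		err = nilfs_do_something(dir);

		if (!err)
			err = nilfs_transaction_commit(dir->i_sb);
		else
			nilfs_transaction_abort(dir->i_sb);
		return err;
	}

The meaning of the third argument is an assumption read off the callers, which pass 1 for creating operations and 0 for unlink/rmdir; the struct itself lives on the caller's stack and is linked through current->journal_info, which is what the nilfs_doing_gc() and nilfs_doing_construction() helpers below test.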
+
+int nilfs_transaction_begin(struct super_block *,
+			    struct nilfs_transaction_info *, int);
+int nilfs_transaction_commit(struct super_block *);
+void nilfs_transaction_abort(struct super_block *);
+
+static inline void nilfs_set_transaction_flag(unsigned int flag)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+
+	ti->ti_flags |= flag;
+}
+
+static inline int nilfs_test_transaction_flag(unsigned int flag)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+
+	if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
+		return 0;
+	return !!(ti->ti_flags & flag);
+}
+
+static inline int nilfs_doing_gc(void)
+{
+	return nilfs_test_transaction_flag(NILFS_TI_GC);
+}
+
+static inline int nilfs_doing_construction(void)
+{
+	return nilfs_test_transaction_flag(NILFS_TI_WRITER);
+}
+
+static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
+{
+	return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
+}
+
+/*
+ * function prototypes
+ */
+#ifdef CONFIG_NILFS_POSIX_ACL
+#error "NILFS: not yet supported POSIX ACL"
+extern int nilfs_permission(struct inode *, int, struct nameidata *);
+extern int nilfs_acl_chmod(struct inode *);
+extern int nilfs_init_acl(struct inode *, struct inode *);
+#else
+#define nilfs_permission   NULL
+
+static inline int nilfs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
+{
+	inode->i_mode &= ~current_umask();
+	return 0;
+}
+#endif
+
+#define NILFS_ATIME_DISABLE
+
+/* dir.c */
+extern int nilfs_add_link(struct dentry *, struct inode *);
+extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
+extern int nilfs_make_empty(struct inode *, struct inode *);
+extern struct nilfs_dir_entry *
+nilfs_find_entry(struct inode *, struct dentry *, struct page **);
+extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
+extern int nilfs_empty_dir(struct inode *);
+extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
+extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
+			   struct page *, struct inode *);
+
+/* file.c */
+extern int nilfs_sync_file(struct file *, struct dentry *, int);
+
+/* ioctl.c */
+long nilfs_ioctl(struct file *, unsigned int, unsigned long);
+int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
+				       void **);
+
+/* inode.c */
+extern struct inode *nilfs_new_inode(struct inode *, int);
+extern void nilfs_free_inode(struct inode *);
+extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern void nilfs_set_inode_flags(struct inode *);
+extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
+extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+extern struct inode *nilfs_iget(struct super_block *, unsigned long);
+extern void nilfs_update_inode(struct inode *, struct buffer_head *);
+extern void nilfs_truncate(struct inode *);
+extern void nilfs_delete_inode(struct inode *);
+extern int nilfs_setattr(struct dentry *, struct iattr *);
+extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
+				  struct buffer_head **);
+extern int nilfs_inode_dirty(struct inode *);
+extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
+				unsigned);
+extern int nilfs_mark_inode_dirty(struct inode *);
+extern void nilfs_dirty_inode(struct inode *); + +/* namei.c */ +extern struct dentry *nilfs_get_parent(struct dentry *); + +/* super.c */ +extern struct inode *nilfs_alloc_inode(struct super_block *); +extern void nilfs_destroy_inode(struct inode *); +extern void nilfs_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void nilfs_warning(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern struct nilfs_super_block * +nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); +extern int nilfs_store_magic_and_option(struct super_block *, + struct nilfs_super_block *, char *); +extern int nilfs_commit_super(struct nilfs_sb_info *, int); +extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); +extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); + +/* gcinode.c */ +int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); +int nilfs_init_gccache(struct the_nilfs *); +void nilfs_destroy_gccache(struct the_nilfs *); +void nilfs_clear_gcinode(struct inode *); +struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64); +void nilfs_remove_all_gcinode(struct the_nilfs *); + +/* gcdat.c */ +int nilfs_init_gcdat_inode(struct the_nilfs *); +void nilfs_commit_gcdat_inode(struct the_nilfs *); +void nilfs_clear_gcdat_inode(struct the_nilfs *); + +/* + * Inodes and files operations + */ +extern struct file_operations nilfs_dir_operations; +extern struct inode_operations nilfs_file_inode_operations; +extern struct file_operations nilfs_file_operations; +extern struct address_space_operations nilfs_aops; +extern struct inode_operations nilfs_dir_inode_operations; +extern struct inode_operations nilfs_special_inode_operations; +extern struct inode_operations nilfs_symlink_inode_operations; + +/* + * filesystem type + */ +extern struct file_system_type nilfs_fs_type; + + +#endif /* _NILFS_H */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c new file mode 100644 index 000000000000..a2692bbc7b50 --- /dev/null +++ b/fs/nilfs2/page.c @@ -0,0 +1,541 @@ +/* + * page.c - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net>, + * Seiji Kihara <kihara@osrg.net>. 
+ */ + +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/swap.h> +#include <linux/bitops.h> +#include <linux/page-flags.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_BUFFER_INHERENT_BITS \ + ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated)) + +static struct buffer_head * +__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, + int blkbits, unsigned long b_state) + +{ + unsigned long first_block; + struct buffer_head *bh; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, b_state); + + first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits); + bh = nilfs_page_get_nth_block(page, block - first_block); + + touch_buffer(bh); + wait_on_buffer(bh); + return bh; +} + +/* + * Since the page cache of B-tree node pages or data page cache of pseudo + * inodes does not have a valid mapping->host pointer, calling + * mark_buffer_dirty() for their buffers causes a NULL pointer dereference; + * it calls __mark_inode_dirty(NULL) through __set_page_dirty(). + * To avoid this problem, the old style mark_buffer_dirty() is used instead. + */ +void nilfs_mark_buffer_dirty(struct buffer_head *bh) +{ + if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) + __set_page_dirty_nobuffers(bh->b_page); +} + +struct buffer_head *nilfs_grab_buffer(struct inode *inode, + struct address_space *mapping, + unsigned long blkoff, + unsigned long b_state) +{ + int blkbits = inode->i_blkbits; + pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); + struct page *page, *opage; + struct buffer_head *bh, *obh; + + page = grab_cache_page(mapping, index); + if (unlikely(!page)) + return NULL; + + bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + if (unlikely(!bh)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) { + /* + * Shadow page cache uses assoc_mapping to point its original + * page cache. The following code tries the original cache + * if the given cache is a shadow and it didn't hit. 
+		 */
+		opage = find_lock_page(mapping->assoc_mapping, index);
+		if (!opage)
+			return bh;
+
+		obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
+					     b_state);
+		if (buffer_uptodate(obh)) {
+			nilfs_copy_buffer(bh, obh);
+			if (buffer_dirty(obh)) {
+				nilfs_mark_buffer_dirty(bh);
+				if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
+					nilfs_mdt_mark_dirty(inode);
+			}
+		}
+		brelse(obh);
+		unlock_page(opage);
+		page_cache_release(opage);
+	}
+	return bh;
+}
+
+/**
+ * nilfs_forget_buffer - discard dirty state
+ * @bh: buffer head of the buffer to be discarded
+ */
+void nilfs_forget_buffer(struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+
+	lock_buffer(bh);
+	clear_buffer_nilfs_volatile(bh);
+	clear_buffer_dirty(bh);
+	if (nilfs_page_buffers_clean(page))
+		__nilfs_clear_page_dirty(page);
+
+	clear_buffer_uptodate(bh);
+	clear_buffer_mapped(bh);
+	bh->b_blocknr = -1;
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+	unlock_buffer(bh);
+	brelse(bh);
+}
+
+/**
+ * nilfs_copy_buffer -- copy buffer data and flags
+ * @dbh: destination buffer
+ * @sbh: source buffer
+ */
+void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
+{
+	void *kaddr0, *kaddr1;
+	unsigned long bits;
+	struct page *spage = sbh->b_page, *dpage = dbh->b_page;
+	struct buffer_head *bh;
+
+	kaddr0 = kmap_atomic(spage, KM_USER0);
+	kaddr1 = kmap_atomic(dpage, KM_USER1);
+	memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
+	kunmap_atomic(kaddr1, KM_USER1);
+	kunmap_atomic(kaddr0, KM_USER0);
+
+	dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
+	dbh->b_blocknr = sbh->b_blocknr;
+	dbh->b_bdev = sbh->b_bdev;
+
+	bh = dbh;
+	bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
+	while ((bh = bh->b_this_page) != dbh) {
+		lock_buffer(bh);
+		bits &= bh->b_state;
+		unlock_buffer(bh);
+	}
+	if (bits & (1UL << BH_Uptodate))
+		SetPageUptodate(dpage);
+	else
+		ClearPageUptodate(dpage);
+	if (bits & (1UL << BH_Mapped))
+		SetPageMappedToDisk(dpage);
+	else
+		ClearPageMappedToDisk(dpage);
+}
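nilfs_copy_buffer() above relies on the b_this_page ring: the buffer heads attached to a page form a circular singly linked list, so a walk starts at any head and ends when it comes back around. The same idiom recurs in nilfs_page_buffers_clean() and nilfs_page_bug() below. A hedged sketch of the pattern (count_dirty_buffers() is a hypothetical helper, not in the patch):

	/* walk a page's circular buffer list; caller knows buffers exist */
	static unsigned count_dirty_buffers(struct page *page)
	{
		struct buffer_head *bh, *head;
		unsigned n = 0;

		bh = head = page_buffers(page);
		do {
			if (buffer_dirty(bh))
				n++;
			bh = bh->b_this_page;	/* last one points back to head */
		} while (bh != head);
		return n;
	}

Note the do/while shape: the body runs at least once because the list is never empty once page_has_buffers() holds, and termination compares against the saved head rather than NULL.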
+
+/**
+ * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
+ * @page: page to be checked
+ *
+ * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
+ * Otherwise, it returns non-zero value.
+ */
+int nilfs_page_buffers_clean(struct page *page)
+{
+	struct buffer_head *bh, *head;
+
+	bh = head = page_buffers(page);
+	do {
+		if (buffer_dirty(bh))
+			return 0;
+		bh = bh->b_this_page;
+	} while (bh != head);
+	return 1;
+}
+
+void nilfs_page_bug(struct page *page)
+{
+	struct address_space *m;
+	unsigned long ino = 0;
+
+	if (unlikely(!page)) {
+		printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
+		return;
+	}
+
+	m = page->mapping;
+	if (m) {
+		struct inode *inode = NILFS_AS_I(m);
+		if (inode != NULL)
+			ino = inode->i_ino;
+	}
+	printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
+	       "mapping=%p ino=%lu\n",
+	       page, atomic_read(&page->_count),
+	       (unsigned long long)page->index, page->flags, m, ino);
+
+	if (page_has_buffers(page)) {
+		struct buffer_head *bh, *head;
+		int i = 0;
+
+		bh = head = page_buffers(page);
+		do {
+			printk(KERN_CRIT
+			       " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
+			       i++, bh, atomic_read(&bh->b_count),
+			       (unsigned long long)bh->b_blocknr, bh->b_state);
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+}
+
+/**
+ * nilfs_alloc_private_page - allocate a private page with buffer heads
+ *
+ * Return Value: On success, a pointer to the allocated page is returned.
+ * On error, NULL is returned.
+ */
+struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
+				      unsigned long state)
+{
+	struct buffer_head *bh, *head, *tail;
+	struct page *page;
+
+	page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
+	if (unlikely(!page))
+		return NULL;
+
+	lock_page(page);
+	head = alloc_page_buffers(page, size, 0);
+	if (unlikely(!head)) {
+		unlock_page(page);
+		__free_page(page);
+		return NULL;
+	}
+
+	bh = head;
+	do {
+		bh->b_state = (1UL << BH_NILFS_Allocated) | state;
+		tail = bh;
+		bh->b_bdev = bdev;
+		bh = bh->b_this_page;
+	} while (bh);
+
+	tail->b_this_page = head;
+	attach_page_buffers(page, head);
+
+	return page;
+}
+
+void nilfs_free_private_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+	BUG_ON(page->mapping);
+
+	if (page_has_buffers(page) && !try_to_free_buffers(page))
+		NILFS_PAGE_BUG(page, "failed to free page");
+
+	unlock_page(page);
+	__free_page(page);
+}
+
+/**
+ * nilfs_copy_page -- copy the page with buffers
+ * @dst: destination page
+ * @src: source page
+ * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
+ *
+ * This function is for both data pages and btnode pages.  The dirty flag
+ * should be handled by the caller.  The page must not be under i/o.
+ * Both src and dst page must be locked.
+ */
+static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
+{
+	struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
+	unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
+
+	BUG_ON(PageWriteback(dst));
+
+	sbh = sbufs = page_buffers(src);
+	if (!page_has_buffers(dst))
+		create_empty_buffers(dst, sbh->b_size, 0);
+
+	if (copy_dirty)
+		mask |= (1UL << BH_Dirty);
+
+	dbh = dbufs = page_buffers(dst);
+	do {
+		lock_buffer(sbh);
+		lock_buffer(dbh);
+		dbh->b_state = sbh->b_state & mask;
+		dbh->b_blocknr = sbh->b_blocknr;
+		dbh->b_bdev = sbh->b_bdev;
+		sbh = sbh->b_this_page;
+		dbh = dbh->b_this_page;
+	} while (dbh != dbufs);
+
+	copy_highpage(dst, src);
+
+	if (PageUptodate(src) && !PageUptodate(dst))
+		SetPageUptodate(dst);
+	else if (!PageUptodate(src) && PageUptodate(dst))
+		ClearPageUptodate(dst);
+	if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
+		SetPageMappedToDisk(dst);
+	else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
+		ClearPageMappedToDisk(dst);
+
+	do {
+		unlock_buffer(sbh);
+		unlock_buffer(dbh);
+		sbh = sbh->b_this_page;
+		dbh = dbh->b_this_page;
+	} while (dbh != dbufs);
+}
+
+int nilfs_copy_dirty_pages(struct address_space *dmap,
+			   struct address_space *smap)
+{
+	struct pagevec pvec;
+	unsigned int i;
+	pgoff_t index = 0;
+	int err = 0;
+
+	pagevec_init(&pvec, 0);
+repeat:
+	if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
+				PAGEVEC_SIZE))
+		return 0;
+
+	for (i = 0; i < pagevec_count(&pvec); i++) {
+		struct page *page = pvec.pages[i], *dpage;
+
+		lock_page(page);
+		if (unlikely(!PageDirty(page)))
+			NILFS_PAGE_BUG(page, "inconsistent dirty state");
+
+		dpage = grab_cache_page(dmap, page->index);
+		if (unlikely(!dpage)) {
+			/* No empty page is added to the page cache */
+			err = -ENOMEM;
+			unlock_page(page);
+			break;
+		}
+		if (unlikely(!page_has_buffers(page)))
+			NILFS_PAGE_BUG(page,
+				       "found empty page in dat page cache");
+
+		nilfs_copy_page(dpage, page, 1);
+		__set_page_dirty_nobuffers(dpage);
+
+		unlock_page(dpage);
+		page_cache_release(dpage);
+		unlock_page(page);
+	}
+	pagevec_release(&pvec);
+	cond_resched();
+
+	if (likely(!err))
+		goto repeat;
+	return err;
+}
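nilfs_copy_dirty_pages() above and nilfs_clear_dirty_pages() later in this file share one scan shape. Reduced to a skeleton (process_page() is a hypothetical callback; this is an illustration of the pattern, not code from the patch), the loop pulls pages out of the radix tree in PAGEVEC_SIZE batches by their dirty tag, works on each under its page lock, then releases the batch and yields before looking for more:

	/* skeleton of the tagged pagevec scan used by the dirty-page walkers */
	static void for_each_dirty_page(struct address_space *mapping,
					void (*process_page)(struct page *))
	{
		struct pagevec pvec;
		pgoff_t index = 0;
		unsigned int i;

		pagevec_init(&pvec, 0);	/* 0: pages are not cold */
		while (pagevec_lookup_tag(&pvec, mapping, &index,
					  PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) {
			for (i = 0; i < pagevec_count(&pvec); i++) {
				struct page *page = pvec.pages[i];

				lock_page(page);
				process_page(page);
				unlock_page(page);
			}
			/* drop the references taken by the lookup */
			pagevec_release(&pvec);
			cond_resched();	/* scans can be long; stay preemptible */
		}
	}

pagevec_lookup_tag() advances the index itself, which is why both functions in the patch can simply restart the lookup without extra bookkeeping.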
+
+/**
+ * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
+ * @dmap: destination page cache
+ * @smap: source page cache
+ *
+ * No pages must be added to the cache during this process.
+ * This must be ensured by the caller.
+ */
+void nilfs_copy_back_pages(struct address_space *dmap,
+			   struct address_space *smap)
+{
+	struct pagevec pvec;
+	unsigned int i, n;
+	pgoff_t index = 0;
+	int err;
+
+	pagevec_init(&pvec, 0);
+repeat:
+	n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
+	if (!n)
+		return;
+	index = pvec.pages[n - 1]->index + 1;
+
+	for (i = 0; i < pagevec_count(&pvec); i++) {
+		struct page *page = pvec.pages[i], *dpage;
+		pgoff_t offset = page->index;
+
+		lock_page(page);
+		dpage = find_lock_page(dmap, offset);
+		if (dpage) {
+			/* override existing page on the destination cache */
+			WARN_ON(PageDirty(dpage));
+			nilfs_copy_page(dpage, page, 0);
+			unlock_page(dpage);
+			page_cache_release(dpage);
+		} else {
+			struct page *page2;
+
+			/* move the page to the destination cache */
+			spin_lock_irq(&smap->tree_lock);
+			page2 = radix_tree_delete(&smap->page_tree, offset);
+			WARN_ON(page2 != page);
+
+			smap->nrpages--;
+			spin_unlock_irq(&smap->tree_lock);
+
+			spin_lock_irq(&dmap->tree_lock);
+			err = radix_tree_insert(&dmap->page_tree, offset, page);
+			if (unlikely(err < 0)) {
+				WARN_ON(err == -EEXIST);
+				page->mapping = NULL;
+				page_cache_release(page); /* for cache */
+			} else {
+				page->mapping = dmap;
+				dmap->nrpages++;
+				if (PageDirty(page))
+					radix_tree_tag_set(&dmap->page_tree,
+							   offset,
+							   PAGECACHE_TAG_DIRTY);
+			}
+			spin_unlock_irq(&dmap->tree_lock);
+		}
+		unlock_page(page);
+	}
+	pagevec_release(&pvec);
+	cond_resched();
+
+	goto repeat;
+}
+
+void nilfs_clear_dirty_pages(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	unsigned int i;
+	pgoff_t index = 0;
+
+	pagevec_init(&pvec, 0);
+
+	while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+				  PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			struct buffer_head *bh, *head;
+
+			lock_page(page);
+			ClearPageUptodate(page);
+			ClearPageMappedToDisk(page);
+			bh = head = page_buffers(page);
+			do {
+				lock_buffer(bh);
+				clear_buffer_dirty(bh);
+				clear_buffer_nilfs_volatile(bh);
+				clear_buffer_uptodate(bh);
+				clear_buffer_mapped(bh);
+				unlock_buffer(bh);
+				bh = bh->b_this_page;
+			} while (bh != head);
+
+			__nilfs_clear_page_dirty(page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+unsigned nilfs_page_count_clean_buffers(struct page *page,
+					unsigned from, unsigned to)
+{
+	unsigned block_start, block_end;
+	struct buffer_head *bh, *head;
+	unsigned nc = 0;
+
+	for (bh = head = page_buffers(page), block_start = 0;
+	     bh != head || !block_start;
+	     block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + bh->b_size;
+		if (block_end > from && block_start < to && !buffer_dirty(bh))
+			nc++;
+	}
+	return nc;
+}
+
+/*
+ * NILFS2 needs clear_page_dirty() in the following two cases:
+ *
+ * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
+ *    page dirty flags when it copies back pages from the shadow cache
+ *    (gcdat->{i_mapping,i_btnode_cache}) to its original cache
+ *    (dat->{i_mapping,i_btnode_cache}).
+ *
+ * 2) Some B-tree operations like insertion or deletion may dispose buffers
+ *    in dirty state, and this needs to cancel the dirty state of their pages.
+ */ +int __nilfs_clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock_irq(&mapping->tree_lock); + if (test_bit(PG_dirty, &page->flags)) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&mapping->tree_lock); + return clear_page_dirty_for_io(page); + } + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + return TestClearPageDirty(page); +} diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h new file mode 100644 index 000000000000..8abca4d1c1f8 --- /dev/null +++ b/fs/nilfs2/page.h @@ -0,0 +1,76 @@ +/* + * page.h - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net>, + * Seiji Kihara <kihara@osrg.net>. + */ + +#ifndef _NILFS_PAGE_H +#define _NILFS_PAGE_H + +#include <linux/buffer_head.h> +#include "nilfs.h" + +/* + * Extended buffer state bits + */ +enum { + BH_NILFS_Allocated = BH_PrivateStart, + BH_NILFS_Node, + BH_NILFS_Volatile, +}; + +BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ +BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ +BUFFER_FNS(NILFS_Volatile, nilfs_volatile) + + +void nilfs_mark_buffer_dirty(struct buffer_head *bh); +int __nilfs_clear_page_dirty(struct page *); + +struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, + unsigned long, unsigned long); +void nilfs_forget_buffer(struct buffer_head *); +void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); +int nilfs_page_buffers_clean(struct page *); +void nilfs_page_bug(struct page *); +struct page *nilfs_alloc_private_page(struct block_device *, int, + unsigned long); +void nilfs_free_private_page(struct page *); + +int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); +void nilfs_copy_back_pages(struct address_space *, struct address_space *); +void nilfs_clear_dirty_pages(struct address_space *); +unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); + +#define NILFS_PAGE_BUG(page, m, a...) \ + do { nilfs_page_bug(page); BUG(); } while (0) + +static inline struct buffer_head * +nilfs_page_get_nth_block(struct page *page, unsigned int count) +{ + struct buffer_head *bh = page_buffers(page); + + while (count-- > 0) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + +#endif /* _NILFS_PAGE_H */ diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c new file mode 100644 index 000000000000..57afa9d24061 --- /dev/null +++ b/fs/nilfs2/recovery.c @@ -0,0 +1,919 @@ +/* + * recovery.c - NILFS recovery logic + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/crc32.h> +#include "nilfs.h" +#include "segment.h" +#include "sufile.h" +#include "page.h" +#include "seglist.h" +#include "segbuf.h" + +/* + * Segment check result + */ +enum { + NILFS_SEG_VALID, + NILFS_SEG_NO_SUPER_ROOT, + NILFS_SEG_FAIL_IO, + NILFS_SEG_FAIL_MAGIC, + NILFS_SEG_FAIL_SEQ, + NILFS_SEG_FAIL_CHECKSUM_SEGSUM, + NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, + NILFS_SEG_FAIL_CHECKSUM_FULL, + NILFS_SEG_FAIL_CONSISTENCY, +}; + +/* work structure for recovery */ +struct nilfs_recovery_block { + ino_t ino; /* Inode number of the file that this block + belongs to */ + sector_t blocknr; /* block number */ + __u64 vblocknr; /* virtual block number */ + unsigned long blkoff; /* File offset of the data block (per block) */ + struct list_head list; +}; + + +static int nilfs_warn_segment_error(int err) +{ + switch (err) { + case NILFS_SEG_FAIL_IO: + printk(KERN_WARNING + "NILFS warning: I/O error on loading last segment\n"); + return -EIO; + case NILFS_SEG_FAIL_MAGIC: + printk(KERN_WARNING + "NILFS warning: Segment magic number invalid\n"); + break; + case NILFS_SEG_FAIL_SEQ: + printk(KERN_WARNING + "NILFS warning: Sequence number mismatch\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SEGSUM: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment summary\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: Checksum error in super root\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_FULL: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment payload\n"); + break; + case NILFS_SEG_FAIL_CONSISTENCY: + printk(KERN_WARNING + "NILFS warning: Inconsistent segment\n"); + break; + case NILFS_SEG_NO_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: No super root in the last segment\n"); + break; + } + return -EINVAL; +} + +static void store_segsum_info(struct nilfs_segsum_info *ssi, + struct nilfs_segment_summary *sum, + unsigned int blocksize) +{ + ssi->flags = le16_to_cpu(sum->ss_flags); + ssi->seg_seq = le64_to_cpu(sum->ss_seq); + ssi->ctime = le64_to_cpu(sum->ss_create); + ssi->next = le64_to_cpu(sum->ss_next); + ssi->nblocks = le32_to_cpu(sum->ss_nblocks); + ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo); + ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes); + + ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); + ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); +} + +/** + * calc_crc_cont - check CRC of blocks continuously + * @sbi: nilfs_sb_info + * @bhs: buffer head of start block + * @sum: place to store result + * @offset: offset bytes in the first block + * @check_bytes: number of bytes to be checked + * @start: DBN of start block + * @nblock: 
number of blocks to be checked + */ +static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, + u32 *sum, unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) +{ + unsigned long blocksize = sbi->s_super->s_blocksize; + unsigned long size; + u32 crc; + + BUG_ON(offset >= blocksize); + check_bytes -= offset; + size = min_t(u64, check_bytes, blocksize - offset); + crc = crc32_le(sbi->s_nilfs->ns_crc_seed, + (unsigned char *)bhs->b_data + offset, size); + if (--nblock > 0) { + do { + struct buffer_head *bh + = sb_bread(sbi->s_super, ++start); + if (!bh) + return -EIO; + check_bytes -= size; + size = min_t(u64, check_bytes, blocksize); + crc = crc32_le(crc, bh->b_data, size); + brelse(bh); + } while (--nblock > 0); + } + *sum = crc; + return 0; +} + +/** + * nilfs_read_super_root_block - read super root block + * @sb: super_block + * @sr_block: disk block number of the super root block + * @pbh: address of a buffer_head pointer to return super root buffer + * @check: CRC check flag + */ +int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, + struct buffer_head **pbh, int check) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *sr; + u32 crc; + int ret; + + *pbh = NULL; + bh_sr = sb_bread(sb, sr_block); + if (unlikely(!bh_sr)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + + sr = (struct nilfs_super_root *)bh_sr->b_data; + if (check) { + unsigned bytes = le16_to_cpu(sr->sr_bytes); + + if (bytes == 0 || bytes > sb->s_blocksize) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc, + sizeof(sr->sr_sum), bytes, sr_block, 1)) { + ret = NILFS_SEG_FAIL_IO; + goto failed_bh; + } + if (crc != le32_to_cpu(sr->sr_sum)) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + } + *pbh = bh_sr; + return 0; + + failed_bh: + brelse(bh_sr); + + failed: + return nilfs_warn_segment_error(ret); +} + +/** + * load_segment_summary - read segment summary of the specified partial segment + * @sbi: nilfs_sb_info + * @pseg_start: start disk block number of partial segment + * @seg_seq: sequence number requested + * @ssi: pointer to nilfs_segsum_info struct to store information + * @full_check: full check flag + * (0: only checks segment summary CRC, 1: data CRC) + */ +static int +load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, + u64 seg_seq, struct nilfs_segsum_info *ssi, + int full_check) +{ + struct buffer_head *bh_sum; + struct nilfs_segment_summary *sum; + unsigned long offset, nblock; + u64 check_bytes; + u32 crc, crc_sum; + int ret = NILFS_SEG_FAIL_IO; + + bh_sum = sb_bread(sbi->s_super, pseg_start); + if (!bh_sum) + goto out; + + sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + /* Check consistency of segment summary */ + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) { + ret = NILFS_SEG_FAIL_MAGIC; + goto failed; + } + store_segsum_info(ssi, sum, sbi->s_super->s_blocksize); + if (seg_seq != ssi->seg_seq) { + ret = NILFS_SEG_FAIL_SEQ; + goto failed; + } + if (full_check) { + offset = sizeof(sum->ss_datasum); + check_bytes = + ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits); + nblock = ssi->nblocks; + crc_sum = le32_to_cpu(sum->ss_datasum); + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + } else { /* only checks segment summary */ + offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum); + check_bytes = ssi->sumbytes; + nblock = ssi->nsumblk; + crc_sum = le32_to_cpu(sum->ss_sumsum); + ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM; + } 
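+ + /* + * Editorial aside (not part of the original patch): the two modes + * above follow the on-disk layout of struct nilfs_segment_summary, + * which begins with the checksum fields themselves: + * + * __le32 ss_datasum; (offset 0, covers the whole partial segment) + * __le32 ss_sumsum; (offset 4, covers the summary area only) + * + * so the full check skips just ss_datasum, while the summary-only + * check skips both fields, i.e. roughly + * crc32_le(seed, (u8 *)sum + 8, ssi->sumbytes - 8) + * for the latter. + */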
+ + if (unlikely(nblock == 0 || + nblock > sbi->s_nilfs->ns_blocks_per_segment)) { + /* This limits the number of blocks read in the CRC check */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, + pseg_start, nblock)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + if (crc == crc_sum) + ret = 0; + failed: + brelse(bh_sum); + out: + return ret; +} + +static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) +{ + void *ptr; + sector_t blocknr; + + BUG_ON((*pbh)->b_size < *offset); + if (bytes > (*pbh)->b_size - *offset) { + blocknr = (*pbh)->b_blocknr; + brelse(*pbh); + *pbh = sb_bread(sb, blocknr + 1); + if (unlikely(!*pbh)) + return NULL; + *offset = 0; + } + ptr = (*pbh)->b_data + *offset; + *offset += bytes; + return ptr; +} + +static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes, + unsigned long count) +{ + unsigned int rest_item_in_current_block + = ((*pbh)->b_size - *offset) / bytes; + + if (count <= rest_item_in_current_block) { + *offset += bytes * count; + } else { + sector_t blocknr = (*pbh)->b_blocknr; + unsigned int nitem_per_block = (*pbh)->b_size / bytes; + unsigned int bcnt; + + count -= rest_item_in_current_block; + bcnt = DIV_ROUND_UP(count, nitem_per_block); + *offset = bytes * (count - (bcnt - 1) * nitem_per_block); + + brelse(*pbh); + *pbh = sb_bread(sb, blocknr + bcnt); + } +} + +static int +collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, + struct nilfs_segsum_info *ssi, + struct list_head *head) +{ + struct buffer_head *bh; + unsigned int offset; + unsigned long nfinfo = ssi->nfinfo; + sector_t blocknr = sum_blocknr + ssi->nsumblk; + ino_t ino; + int err = -EIO; + + if (!nfinfo) + return 0; + + bh = sb_bread(sbi->s_super, sum_blocknr); + if (unlikely(!bh)) + goto out; + + offset = le16_to_cpu( + ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes); + for (;;) { + unsigned long nblocks, ndatablk, nnodeblk; + struct nilfs_finfo *finfo; + + finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo)); + if (unlikely(!finfo)) + goto out; + + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + nnodeblk = nblocks - ndatablk; + + while (ndatablk-- > 0) { + struct nilfs_recovery_block *rb; + struct nilfs_binfo_v *binfo; + + binfo = segsum_get(sbi->s_super, &bh, &offset, + sizeof(*binfo)); + if (unlikely(!binfo)) + goto out; + + rb = kmalloc(sizeof(*rb), GFP_NOFS); + if (unlikely(!rb)) { + err = -ENOMEM; + goto out; + } + rb->ino = ino; + rb->blocknr = blocknr++; + rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); + rb->blkoff = le64_to_cpu(binfo->bi_blkoff); + /* INIT_LIST_HEAD(&rb->list); */ + list_add_tail(&rb->list, head); + } + if (--nfinfo == 0) + break; + blocknr += nnodeblk; /* always 0 for the data sync segments */ + segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64), + nnodeblk); + if (unlikely(!bh)) + goto out; + } + err = 0; + out: + brelse(bh); /* brelse(NULL) is just ignored */ + return err; +} + +static void dispose_recovery_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_recovery_block *rb + = list_entry(head->next, + struct nilfs_recovery_block, list); + list_del(&rb->list); + kfree(rb); + } +} + +void nilfs_dispose_segment_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_segment_entry *ent + = 
list_entry(head->next, + struct nilfs_segment_entry, list); + list_del(&ent->list); + nilfs_free_segment_entry(ent); + } +} + +static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct list_head *head = &ri->ri_used_segments; + struct nilfs_segment_entry *ent, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 segnum[4]; + int err; + int i; + + segnum[0] = nilfs->ns_segnum; + segnum[1] = nilfs->ns_nextnum; + segnum[2] = ri->ri_segnum; + segnum[3] = ri->ri_nextnum; + + nilfs_attach_writer(nilfs, sbi); + /* + * Releasing the next segment of the latest super root. + * The next segment is invalidated by this recovery. + */ + err = nilfs_sufile_free(sufile, segnum[1]); + if (unlikely(err)) + goto failed; + + err = -ENOMEM; + for (i = 1; i < 4; i++) { + ent = nilfs_alloc_segment_entry(segnum[i]); + if (unlikely(!ent)) + goto failed; + list_add_tail(&ent->list, head); + } + + /* + * Collecting segments written after the latest super root. + * These are marked dirty to avoid being reallocated in the next write. + */ + list_for_each_entry_safe(ent, n, head, list) { + if (ent->segnum != segnum[0]) { + err = nilfs_sufile_scrap(sufile, ent->segnum); + if (unlikely(err)) + goto failed; + } + list_del(&ent->list); + nilfs_free_segment_entry(ent); + } + + /* Allocate new segments for recovery */ + err = nilfs_sufile_alloc(sufile, &segnum[0]); + if (unlikely(err)) + goto failed; + + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq = ri->ri_seq + 2; + nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; + + failed: + /* No need to recover sufile because it will be destroyed on error */ + nilfs_detach_writer(nilfs, sbi); + return err; +} + +static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, + struct nilfs_recovery_block *rb, + struct page *page) +{ + struct buffer_head *bh_org; + void *kaddr; + + bh_org = sb_bread(sbi->s_super, rb->blocknr); + if (unlikely(!bh_org)) + return -EIO; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh_org); + return 0; +} + +static int recover_dsync_blocks(struct nilfs_sb_info *sbi, + struct list_head *head, + unsigned long *nr_salvaged_blocks) +{ + struct inode *inode; + struct nilfs_recovery_block *rb, *n; + unsigned blocksize = sbi->s_super->s_blocksize; + struct page *page; + loff_t pos; + int err = 0, err2 = 0; + + list_for_each_entry_safe(rb, n, head, list) { + inode = nilfs_iget(sbi->s_super, rb->ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto failed_inode; + } + + pos = rb->blkoff << inode->i_blkbits; + page = NULL; + err = block_write_begin(NULL, inode->i_mapping, pos, blocksize, + 0, &page, NULL, nilfs_get_block); + if (unlikely(err)) + goto failed_inode; + + err = nilfs_recovery_copy_block(sbi, rb, page); + if (unlikely(err)) + goto failed_page; + + err = nilfs_set_file_dirty(sbi, inode, 1); + if (unlikely(err)) + goto failed_page; + + block_write_end(NULL, inode->i_mapping, pos, blocksize, + blocksize, page, NULL); + + unlock_page(page); + page_cache_release(page); + + (*nr_salvaged_blocks)++; + goto next; + + failed_page: + unlock_page(page); + page_cache_release(page); + + failed_inode: + printk(KERN_WARNING + "NILFS warning: error recovering data block " + "(err=%d, ino=%lu, block-offset=%llu)\n", + err, rb->ino, (unsigned long long)rb->blkoff); + if (!err2) + err2 = err; + next: + iput(inode); /* iput(NULL) is just ignored */ + 
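/* + * Editorial recap (not in the original patch): each salvaged block + * above goes through the ordinary page-cache write path -- + * block_write_begin() locks the page, nilfs_recovery_copy_block() + * copies the logged data into it, nilfs_set_file_dirty() queues the + * inode, and block_write_end() marks the buffers dirty -- so the + * data is rewritten by the next segment construction rather than + * patched in place on disk. + */ +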
list_del_init(&rb->list); + kfree(rb); + } + return err2; +} + +/** + * nilfs_do_roll_forward - salvage logical segments newer than the latest + * checkpoint + * @sbi: nilfs_sb_info + * @nilfs: the_nilfs + * @ri: pointer to a nilfs_recovery_info + */ +static int nilfs_do_roll_forward(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct nilfs_segsum_info ssi; + sector_t pseg_start; + sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ + unsigned long nsalvaged_blocks = 0; + u64 seg_seq; + __u64 segnum, nextnum = 0; + int empty_seg = 0; + int err = 0, ret; + LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ + enum { + RF_INIT_ST, + RF_DSYNC_ST, /* scanning data-sync segments */ + }; + int state = RF_INIT_ST; + + nilfs_attach_writer(nilfs, sbi); + pseg_start = ri->ri_lsegs_start; + seg_seq = ri->ri_lsegs_start_seq; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) { + err = -EIO; + goto failed; + } + goto strayed; + } + if (unlikely(NILFS_SEG_HAS_SR(&ssi))) + goto confused; + + /* Found a valid partial segment; do recovery actions */ + nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + empty_seg = 0; + nilfs->ns_ctime = ssi.ctime; + if (!(ssi.flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = ssi.ctime; + + switch (state) { + case RF_INIT_ST: + if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi)) + goto try_next_pseg; + state = RF_DSYNC_ST; + /* Fall through */ + case RF_DSYNC_ST: + if (!NILFS_SEG_DSYNC(&ssi)) + goto confused; + + err = collect_blocks_from_segsum( + sbi, pseg_start, &ssi, &dsync_blocks); + if (unlikely(err)) + goto failed; + if (NILFS_SEG_LOGEND(&ssi)) { + err = recover_dsync_blocks( + sbi, &dsync_blocks, &nsalvaged_blocks); + if (unlikely(err)) + goto failed; + state = RF_INIT_ST; + } + break; /* Fall through to try_next_pseg */ + } + + try_next_pseg: + if (pseg_start == ri->ri_lsegs_end) + break; + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + if (pseg_start == ri->ri_lsegs_end) + break; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + break; + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + if (nsalvaged_blocks) { + printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", + sbi->s_super->s_id, nsalvaged_blocks); + ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; + } + out: + dispose_recovery_list(&dsync_blocks); + nilfs_detach_writer(sbi->s_nilfs, sbi); + return err; + + confused: + err = -EINVAL; + failed: + printk(KERN_ERR + "NILFS (device %s): Error roll-forwarding " + "(err=%d, pseg block=%llu). 
", + sbi->s_super->s_id, err, (unsigned long long)pseg_start); + goto out; +} + +static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh; + int err; + + if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != + nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) + return; + + bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start); + BUG_ON(!bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_dirty(bh); + err = sync_dirty_buffer(bh); + if (unlikely(err)) + printk(KERN_WARNING + "NILFS warning: buffer sync write failed during " + "post-cleaning of recovery.\n"); + brelse(bh); +} + +/** + * nilfs_recover_logical_segments - salvage logical segments written after + * the latest super root + * @nilfs: the_nilfs + * @sbi: nilfs_sb_info + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - Inconsistent filesystem state. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_recover_logical_segments(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + int err; + + if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) + return 0; + + err = nilfs_attach_checkpoint(sbi, ri->ri_cno); + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: error loading the latest checkpoint.\n"); + return err; + } + + err = nilfs_do_roll_forward(nilfs, sbi, ri); + if (unlikely(err)) + goto failed; + + if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { + err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Error preparing segments for " + "recovery.\n"); + goto failed; + } + + err = nilfs_attach_segment_constructor(sbi); + if (unlikely(err)) + goto failed; + + set_nilfs_discontinued(nilfs); + err = nilfs_construct_segment(sbi->s_super); + nilfs_detach_segment_constructor(sbi); + + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Oops! recovery failed. " + "(err=%d)\n", err); + goto failed; + } + + nilfs_finish_roll_forward(nilfs, sbi, ri); + } + + nilfs_detach_checkpoint(sbi); + return 0; + + failed: + nilfs_detach_checkpoint(sbi); + nilfs_mdt_clear(nilfs->ns_cpfile); + nilfs_mdt_clear(nilfs->ns_sufile); + nilfs_mdt_clear(nilfs->ns_dat); + return err; +} + +/** + * nilfs_search_super_root - search the latest valid super root + * @nilfs: the_nilfs + * @sbi: nilfs_sb_info + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * nilfs_search_super_root() looks for the latest super-root from a partial + * segment pointed by the superblock. It sets up struct the_nilfs through + * this search. It fills nilfs_recovery_info (ri) required for recovery. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. 
+ * + * %-EINVAL - No valid segment found + * + * %-EIO - I/O error + */ +int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct nilfs_segsum_info ssi; + sector_t pseg_start, pseg_end, sr_pseg_start = 0; + sector_t seg_start, seg_end; /* range of full segment (block number) */ + u64 seg_seq; + __u64 segnum, nextnum = 0; + __u64 cno; + struct nilfs_segment_entry *ent; + LIST_HEAD(segments); + int empty_seg = 0, scan_newer = 0; + int ret; + + pseg_start = nilfs->ns_last_pseg; + seg_seq = nilfs->ns_last_seq; + cno = nilfs->ns_last_cno; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + + /* Calculate range of segment */ + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + for (;;) { + /* Load segment summary */ + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) + goto failed; + goto strayed; + } + pseg_end = pseg_start + ssi.nblocks - 1; + if (unlikely(pseg_end > seg_end)) { + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto strayed; + } + + /* A valid partial segment */ + ri->ri_pseg_start = pseg_start; + ri->ri_seq = seg_seq; + ri->ri_segnum = segnum; + nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + ri->ri_nextnum = nextnum; + empty_seg = 0; + + if (!NILFS_SEG_HAS_SR(&ssi)) { + if (!scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { + ri->ri_lsegs_start = pseg_start; + ri->ri_lsegs_start_seq = seg_seq; + } + if (NILFS_SEG_LOGEND(&ssi)) + ri->ri_lsegs_end = pseg_start; + goto try_next_pseg; + } + + /* A valid super root was found. */ + ri->ri_cno = cno++; + ri->ri_super_root = pseg_end; + ri->ri_lsegs_start = ri->ri_lsegs_end = 0; + + nilfs_dispose_segment_list(&segments); + nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start) + + ssi.nblocks - seg_start; + nilfs->ns_seg_seq = seg_seq; + nilfs->ns_segnum = segnum; + nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ + nilfs->ns_ctime = ssi.ctime; + nilfs->ns_nextnum = nextnum; + + if (scan_newer) + ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; + else { + if (nilfs->ns_mount_state & NILFS_VALID_FS) + goto super_root_found; + scan_newer = 1; + } + + /* reset region for roll-forward */ + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + try_next_pseg: + /* Standing on a course, or met an inconsistent state */ + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + /* Off the trail */ + if (!scan_newer) + /* + * This can happen if a checkpoint was written without + * barriers, or as a result of an I/O failure. 
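+ * + * (Editorial note, not in the original patch: reaching this + * point with scan_newer == 0 means even the partial segment + * that the superblock points at failed validation, so the + * mount gives up instead of guessing at an older root.)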
+ */ + goto failed; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + goto super_root_found; /* found a valid super root */ + + ent = nilfs_alloc_segment_entry(segnum); + if (unlikely(!ent)) { + ret = -ENOMEM; + goto failed; + } + list_add_tail(&ent->list, &segments); + + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + super_root_found: + /* Updating pointers relating to the latest checkpoint */ + list_splice(&segments, ri->ri_used_segments.prev); + nilfs->ns_last_pseg = sr_pseg_start; + nilfs->ns_last_seq = nilfs->ns_seg_seq; + nilfs->ns_last_cno = ri->ri_cno; + return 0; + + failed: + nilfs_dispose_segment_list(&segments); + return (ret < 0) ? ret : nilfs_warn_segment_error(ret); +} diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h new file mode 100644 index 000000000000..adccd4fc654e --- /dev/null +++ b/fs/nilfs2/sb.h @@ -0,0 +1,102 @@ +/* + * sb.h - NILFS on-memory super block structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _NILFS_SB +#define _NILFS_SB + +#include <linux/types.h> +#include <linux/fs.h> + +/* + * Mount options + */ +struct nilfs_mount_options { + unsigned long mount_opt; + __u64 snapshot_cno; +}; + +struct the_nilfs; +struct nilfs_sc_info; + +/* + * NILFS super-block data in memory + */ +struct nilfs_sb_info { + /* Snapshot status */ + __u64 s_snapshot_cno; /* Checkpoint number */ + atomic_t s_inodes_count; + atomic_t s_blocks_count; /* Reserved (might be deleted) */ + + /* Mount options */ + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + + unsigned long s_interval; /* construction interval */ + unsigned long s_watermark; /* threshold of data amount + for the segment construction */ + + /* Fundamental members */ + struct super_block *s_super; /* reverse pointer to super_block */ + struct the_nilfs *s_nilfs; + struct list_head s_list; /* list head for nilfs->ns_supers */ + + /* Segment constructor */ + struct list_head s_dirty_files; /* dirty files list */ + struct nilfs_sc_info *s_sc_info; /* segment constructor info */ + spinlock_t s_inode_lock; /* Lock for the nilfs inode. 
+ It covers s_dirty_files list */ + + /* Metadata files */ + struct inode *s_ifile; /* index file inode */ + + /* Inode allocator */ + spinlock_t s_next_gen_lock; + u32 s_next_generation; +}; + +static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi) +{ + return sbi->s_sc_info; +} + +/* + * Bit operations for the mount option + */ +#define nilfs_clear_opt(sbi, opt) \ + do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) +#define nilfs_set_opt(sbi, opt) \ + do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0) +#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt) +#define nilfs_write_opt(sbi, mask, opt) \ + do { (sbi)->s_mount_opt = \ + (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \ + NILFS_MOUNT_##opt); \ + } while (0) + +#endif /* _NILFS_SB */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c new file mode 100644 index 000000000000..1e68821b4a9b --- /dev/null +++ b/fs/nilfs2/segbuf.c @@ -0,0 +1,439 @@ +/* + * segbuf.c - NILFS segment buffer + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/crc32.h> +#include "page.h" +#include "segbuf.h" +#include "seglist.h" + + +static struct kmem_cache *nilfs_segbuf_cachep; + +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} + +int __init nilfs_init_segbuf_cache(void) +{ + nilfs_segbuf_cachep = + kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), + 0, SLAB_RECLAIM_ACCOUNT, + nilfs_segbuf_init_once); + + return (nilfs_segbuf_cachep == NULL) ? 
-ENOMEM : 0; +} + +void nilfs_destroy_segbuf_cache(void) +{ + kmem_cache_destroy(nilfs_segbuf_cachep); +} + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) +{ + struct nilfs_segment_buffer *segbuf; + + segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); + if (unlikely(!segbuf)) + return NULL; + + segbuf->sb_super = sb; + INIT_LIST_HEAD(&segbuf->sb_list); + INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); + INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + return segbuf; +} + +void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) +{ + kmem_cache_free(nilfs_segbuf_cachep, segbuf); +} + +void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, + unsigned long offset, struct the_nilfs *nilfs) +{ + segbuf->sb_segnum = segnum; + nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, + &segbuf->sb_fseg_end); + + segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, + __u64 nextnum, struct the_nilfs *nilfs) +{ + segbuf->sb_nextnum = nextnum; + segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); +} + +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_segsum_buffer(segbuf, bh); + return 0; +} + +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_payload_buffer(segbuf, bh); + *bhp = bh; + return 0; +} + +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, + time_t ctime) +{ + int err; + + segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + return err; + + segbuf->sb_sum.flags = flags; + segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); + segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; + segbuf->sb_sum.ctime = ctime; + + segbuf->sb_io_error = 0; + return 0; +} + +/* + * Set up segment summary + */ +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct nilfs_segment_summary *raw_sum; + struct buffer_head *bh_sum; + + bh_sum = list_entry(segbuf->sb_segsum_buffers.next, + struct buffer_head, b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); + raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); + raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); + raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); + raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime); + raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); + raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); + raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); + raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); + raw_sum->ss_pad = 0; +} + +/* + * CRC calculation routines + */ +void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + unsigned long size, bytes = segbuf->sb_sum.sumbytes; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(seed, + (unsigned char *)raw_sum + + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), + size - (sizeof(raw_sum->ss_datasum) + + sizeof(raw_sum->ss_sumsum))); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + bytes -= size; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(crc, bh->b_data, size); + } + raw_sum->ss_sumsum = cpu_to_le32(crc); +} + +void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + void *kaddr; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), + bh->b_size - sizeof(raw_sum->ss_datasum)); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + crc = crc32_le(crc, bh->b_data, bh->b_size); + } + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + kaddr = kmap_atomic(bh->b_page, KM_USER0); + crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); + kunmap_atomic(kaddr, KM_USER0); + } + raw_sum->ss_datasum = cpu_to_le32(crc); +} + +void nilfs_release_buffers(struct list_head *list) +{ + struct buffer_head *bh, *n; + + list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + if (buffer_nilfs_allocated(bh)) { + struct page *clone_page = bh->b_page; + + /* remove clone page */ + brelse(bh); + page_cache_release(clone_page); /* for each bh */ + if (page_count(clone_page) <= 2) { + lock_page(clone_page); + nilfs_free_private_page(clone_page); + } + continue; + } + brelse(bh); + } +} + +/* + * BIO operations + */ +static void nilfs_end_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nilfs_write_info *wi = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + bio_put(bio); + /* to be detected by submit_seg_bio() */ + } + + if (!uptodate) + atomic_inc(&wi->err); + + bio_put(bio); + complete(&wi->bio_event); +} + +static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) +{ + struct bio *bio = wi->bio; + int err; + + if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { + wait_for_completion(&wi->bio_event); + wi->nbio--; + if (unlikely(atomic_read(&wi->err))) { + bio_put(bio); + err = -EIO; + goto failed; + } + } + + bio->bi_end_io = nilfs_end_bio_write; + bio->bi_private = wi; + bio_get(bio); + submit_bio(mode, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + bio_put(bio); + err = -EOPNOTSUPP; + goto failed; + } + wi->nbio++; + bio_put(bio); + + wi->bio = NULL; + wi->rest_blocks -= wi->end - wi->start; + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end; + return 0; + + failed: + wi->bio = NULL; + return err; +} + +/** + * nilfs_alloc_seg_bio - allocate a bio for writing a segment. + * @sb: super block + * @start: beginning disk block number of this BIO. + * @nr_vecs: request size of page vector. + * + * alloc_seg_bio() allocates a new BIO structure and initializes it. + * + * Return Value: On success, pointer to the struct bio is returned. + * On error, NULL is returned.
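+ * + * Editorial note (not part of the original patch): as the fallback + * loop below shows, an allocation failure is retried with the vector + * count halved each time, so the returned bio may hold fewer pages + * than requested; nilfs_submit_bh() copes by submitting the full bio + * and re-adding the page to a fresh one.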
+ */ +static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, + int nr_vecs) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOWAIT, nr_vecs); + if (bio == NULL) { + while (!bio && (nr_vecs >>= 1)) + bio = bio_alloc(GFP_NOWAIT, nr_vecs); + } + if (likely(bio)) { + bio->bi_bdev = sb->s_bdev; + bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9); + } + return bio; +} + +void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + wi->bio = NULL; + wi->rest_blocks = segbuf->sb_sum.nblocks; + wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev); + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end = 0; + wi->nbio = 0; + wi->blocknr = segbuf->sb_pseg_start; + + atomic_set(&wi->err, 0); + init_completion(&wi->bio_event); +} + +static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, + int mode) +{ + int len, err; + + BUG_ON(wi->nr_vecs <= 0); + repeat: + if (!wi->bio) { + wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end, + wi->nr_vecs); + if (unlikely(!wi->bio)) + return -ENOMEM; + } + + len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (len == bh->b_size) { + wi->end++; + return 0; + } + /* bio is FULL */ + err = nilfs_submit_seg_bio(wi, mode); + /* never submit current bh */ + if (likely(!err)) + goto repeat; + return err; +} + +int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + struct buffer_head *bh; + int res, rw = WRITE; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + res = nilfs_submit_bh(wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + res = nilfs_submit_bh(wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + if (wi->bio) { + /* + * Last BIO is always sent through the following + * submission. + */ + rw |= (1 << BIO_RW_SYNCIO); + res = nilfs_submit_seg_bio(wi, rw); + if (unlikely(res)) + goto failed_bio; + } + + res = 0; + out: + return res; + + failed_bio: + atomic_inc(&wi->err); + goto out; +} + +/** + * nilfs_segbuf_wait - wait for completion of requested BIOs + * @wi: nilfs_write_info + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + */ +int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + int err = 0; + + if (!wi->nbio) + return 0; + + do { + wait_for_completion(&wi->bio_event); + } while (--wi->nbio > 0); + + if (unlikely(atomic_read(&wi->err) > 0)) { + printk(KERN_ERR "NILFS: IO error writing segment\n"); + err = -EIO; + segbuf->sb_io_error = 1; + } + return err; +} diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h new file mode 100644 index 000000000000..0c3076f4e592 --- /dev/null +++ b/fs/nilfs2/segbuf.h @@ -0,0 +1,201 @@ +/* + * segbuf.h - NILFS Segment buffer prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGBUF_H +#define _NILFS_SEGBUF_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/bio.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> + +/** + * struct nilfs_segsum_info - On-memory segment summary + * @flags: Flags + * @nfinfo: Number of file information structures + * @nblocks: Number of blocks included in the partial segment + * @nsumblk: Number of summary blocks + * @sumbytes: Byte count of segment summary + * @nfileblk: Total number of file blocks + * @seg_seq: Segment sequence number + * @ctime: Creation time + * @next: Block number of the next full segment + */ +struct nilfs_segsum_info { + unsigned int flags; + unsigned long nfinfo; + unsigned long nblocks; + unsigned long nsumblk; + unsigned long sumbytes; + unsigned long nfileblk; + u64 seg_seq; + time_t ctime; + sector_t next; +}; + +/* macro for the flags */ +#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR) +#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN) +#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND) +#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT) +#define NILFS_SEG_SIMPLEX(sum) \ + (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \ + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) + +#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk) + +/** + * struct nilfs_segment_buffer - Segment buffer + * @sb_super: back pointer to a superblock struct + * @sb_list: List head to chain this structure + * @sb_sum: On-memory segment summary + * @sb_segnum: Index number of the full segment + * @sb_nextnum: Index number of the next full segment + * @sb_fseg_start: Start block number of the full segment + * @sb_fseg_end: End block number of the full segment + * @sb_pseg_start: Disk block number of partial segment + * @sb_rest_blocks: Number of residual blocks in the current segment + * @sb_segsum_buffers: List of buffers for segment summaries + * @sb_payload_buffers: List of buffers for segment payload + * @sb_io_error: I/O error status + */ +struct nilfs_segment_buffer { + struct super_block *sb_super; + struct list_head sb_list; + + /* Segment information */ + struct nilfs_segsum_info sb_sum; + __u64 sb_segnum; + __u64 sb_nextnum; + sector_t sb_fseg_start, sb_fseg_end; + sector_t sb_pseg_start; + unsigned sb_rest_blocks; + + /* Buffers */ + struct list_head sb_segsum_buffers; + struct list_head sb_payload_buffers; /* including super root */ + + /* io status */ + int sb_io_error; +}; + +#define NILFS_LIST_SEGBUF(head) \ + list_entry((head), struct nilfs_segment_buffer, sb_list) +#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next) +#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev) +#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev) +#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next) +#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head)) + +#define nilfs_for_each_segbuf_before(s, t, h) \ + for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \ + (s) = NILFS_NEXT_SEGBUF(s)) + +#define NILFS_SEGBUF_FIRST_BH(head) \ + (list_entry((head)->next, struct buffer_head, b_assoc_buffers)) +#define 
NILFS_SEGBUF_NEXT_BH(bh) \ + (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \ + b_assoc_buffers)) +#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) + + +int __init nilfs_init_segbuf_cache(void); +void nilfs_destroy_segbuf_cache(void); +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); +void nilfs_segbuf_free(struct nilfs_segment_buffer *); +void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, + struct the_nilfs *); +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, + struct the_nilfs *); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, + struct buffer_head **); +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); +void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32); +void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32); + +static inline void +nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers); + segbuf->sb_sum.nblocks++; + segbuf->sb_sum.nsumblk++; +} + +static inline void +nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers); + segbuf->sb_sum.nblocks++; +} + +static inline void +nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + get_bh(bh); + nilfs_segbuf_add_payload_buffer(segbuf, bh); + segbuf->sb_sum.nfileblk++; +} + +void nilfs_release_buffers(struct list_head *); + +static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); +} + +struct nilfs_write_info { + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; + + int nbio; + atomic_t err; + struct completion bio_event; + /* completion event of segment write */ + + /* + * The following fields must be set explicitly + */ + struct super_block *sb; + struct backing_dev_info *bdi; /* backing dev info */ + struct buffer_head *bh_sr; +}; + + +void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *, + struct nilfs_write_info *); +int nilfs_segbuf_write(struct nilfs_segment_buffer *, + struct nilfs_write_info *); +int nilfs_segbuf_wait(struct nilfs_segment_buffer *, + struct nilfs_write_info *); + +#endif /* _NILFS_SEGBUF_H */ diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h new file mode 100644 index 000000000000..d39df9144e99 --- /dev/null +++ b/fs/nilfs2/seglist.h @@ -0,0 +1,85 @@ +/* + * seglist.h - expediential structure and routines to handle list of segments + * (would be removed in a future release) + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGLIST_H +#define _NILFS_SEGLIST_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "sufile.h" + +struct nilfs_segment_entry { + __u64 segnum; + +#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisionally. + It must be cancelled if + construction is aborted */ + + unsigned flags; + struct list_head list; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; +}; + + +void nilfs_dispose_segment_list(struct list_head *); + +static inline struct nilfs_segment_entry * +nilfs_alloc_segment_entry(__u64 segnum) +{ + struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); + + if (likely(ent)) { + ent->segnum = segnum; + ent->flags = 0; + ent->bh_su = NULL; + ent->raw_su = NULL; + INIT_LIST_HEAD(&ent->list); + } + return ent; +} + +static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent, + struct inode *sufile) +{ + return nilfs_sufile_get_segment_usage(sufile, ent->segnum, + &ent->raw_su, &ent->bh_su); +} + +static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent, + struct inode *sufile) +{ + if (!ent->bh_su) + return; + nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su); + ent->bh_su = NULL; + ent->raw_su = NULL; +} + +static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent) +{ + kfree(ent); +} + +#endif /* _NILFS_SEGLIST_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c new file mode 100644 index 000000000000..22c7f65c2403 --- /dev/null +++ b/fs/nilfs2/segment.c @@ -0,0 +1,2978 @@ +/* + * segment.c - NILFS segment constructor. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/pagemap.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/bio.h> +#include <linux/completion.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/crc32.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "btnode.h" +#include "page.h" +#include "segment.h" +#include "sufile.h" +#include "cpfile.h" +#include "ifile.h" +#include "seglist.h" +#include "segbuf.h" + + +/* + * Segment constructor + */ +#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ + +#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments + appended in collection retry loop */ + +/* Construction mode */ +enum { + SC_LSEG_SR = 1, /* Make a logical segment having a super root */ + SC_LSEG_DSYNC, /* Flush data blocks of a given file and make + a logical segment without a super root */ + SC_FLUSH_FILE, /* Flush data files, leads to segment writes without + creating a checkpoint */ + SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without + a checkpoint */ +}; + +/* Stage numbers of dirty block collection */ +enum { + NILFS_ST_INIT = 0, + NILFS_ST_GC, /* Collecting dirty blocks for GC */ + NILFS_ST_FILE, + NILFS_ST_IFILE, + NILFS_ST_CPFILE, + NILFS_ST_SUFILE, + NILFS_ST_DAT, + NILFS_ST_SR, /* Super root */ + NILFS_ST_DSYNC, /* Data sync blocks */ + NILFS_ST_DONE, +}; + +/* State flags of collection */ +#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ +#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ +#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED) + +/* Operations depending on the construction mode and file type */ +struct nilfs_sc_operations { + int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + void (*write_data_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); + void (*write_node_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); +}; + +/* + * Other definitions + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *); +static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); +static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, + int); + +#define nilfs_cnt32_gt(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(b) - (__s32)(a) < 0)) +#define nilfs_cnt32_ge(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(a) - (__s32)(b) >= 0)) +#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) +#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) + +/* + * Transaction + */ +static struct kmem_cache *nilfs_transaction_cachep; + +/** + * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info + * + * nilfs_init_transaction_cache() creates a slab cache for the struct + * nilfs_transaction_info. + * + * Return Value: On success, it returns 0. On error, one of the following + * negative error code is returned. 
+ * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_init_transaction_cache(void) +{ + nilfs_transaction_cachep = + kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0; +} + +/** + * nilfs_destroy_transaction_cache - destroy the cache for transaction info + * + * nilfs_destroy_transaction_cache() frees the slab cache for the struct + * nilfs_transaction_info. + */ +void nilfs_destroy_transaction_cache(void) +{ + kmem_cache_destroy(nilfs_transaction_cachep); +} + +static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + void *save = NULL; + + if (cur_ti) { + if (cur_ti->ti_magic == NILFS_TI_MAGIC) + return ++cur_ti->ti_count; + else { + /* + * If the journal_info field is occupied by another FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). + */ + printk(KERN_WARNING + "NILFS warning: journal info from a different " + "FS\n"); + save = current->journal_info; + } + } + if (!ti) { + ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); + if (!ti) + return -ENOMEM; + ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; + } else { + ti->ti_flags = 0; + } + ti->ti_count = 0; + ti->ti_save = save; + ti->ti_magic = NILFS_TI_MAGIC; + current->journal_info = ti; + return 0; +} + +/** + * nilfs_transaction_begin - start indivisible file operations. + * @sb: super block + * @ti: nilfs_transaction_info + * @vacancy_check: flags for vacancy rate checks + * + * nilfs_transaction_begin() acquires a reader/writer semaphore, called + * the segment semaphore, to make segment construction and write tasks + * exclusive. The function is used in pairs with nilfs_transaction_commit(). + * The region enclosed by these two functions can be nested. To avoid a + * deadlock, the semaphore is only acquired or released in the outermost call. + * + * This function allocates a nilfs_transaction_info struct to keep context + * information on it. It is initialized and hooked onto the current task in + * the outermost call. If a pre-allocated struct is given to @ti, it is used + * instead; otherwise a new struct is assigned from a slab. + * + * When the @vacancy_check flag is set, this function will check the amount of + * free space, and will wait for the GC to reclaim disk space if low capacity. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-ENOSPC - No space left on device + */ +int nilfs_transaction_begin(struct super_block *sb, + struct nilfs_transaction_info *ti, + int vacancy_check) +{ + struct nilfs_sb_info *sbi; + struct the_nilfs *nilfs; + int ret = nilfs_prepare_segment_lock(ti); + + if (unlikely(ret < 0)) + return ret; + if (ret > 0) + return 0; + + sbi = NILFS_SB(sb); + nilfs = sbi->s_nilfs; + down_read(&nilfs->ns_segctor_sem); + if (vacancy_check && nilfs_near_disk_full(nilfs)) { + up_read(&nilfs->ns_segctor_sem); + ret = -ENOSPC; + goto failed; + } + return 0; + + failed: + ti = current->journal_info; + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return ret; +} + +/** + * nilfs_transaction_commit - commit indivisible file operations. + * @sb: super block + * + * nilfs_transaction_commit() releases the read semaphore which is + * acquired by nilfs_transaction_begin().
This is only performed + * in outermost call of this function. If a commit flag is set, + * nilfs_transaction_commit() sets a timer to start the segment + * constructor. If a sync flag is set, it starts construction + * directly. + */ +int nilfs_transaction_commit(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_sb_info *sbi; + struct nilfs_sc_info *sci; + int err = 0; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + ti->ti_flags |= NILFS_TI_COMMIT; + if (ti->ti_count > 0) { + ti->ti_count--; + return 0; + } + sbi = NILFS_SB(sb); + sci = NILFS_SC(sbi); + if (sci != NULL) { + if (ti->ti_flags & NILFS_TI_COMMIT) + nilfs_segctor_start_timer(sci); + if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) > + sci->sc_watermark) + nilfs_segctor_do_flush(sci, 0); + } + up_read(&sbi->s_nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + + if (ti->ti_flags & NILFS_TI_SYNC) + err = nilfs_construct_segment(sb); + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return err; +} + +void nilfs_transaction_abort(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + if (ti->ti_count > 0) { + ti->ti_count--; + return; + } + up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem); + + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); +} + +void nilfs_relax_pressure_in_lock(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct the_nilfs *nilfs = sbi->s_nilfs; + + if (!sci || !sci->sc_flush_request) + return; + + set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); + up_read(&nilfs->ns_segctor_sem); + + down_write(&nilfs->ns_segctor_sem); + if (sci->sc_flush_request && + test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= NILFS_TI_WRITER; + nilfs_segctor_do_immediate_flush(sci); + ti->ti_flags &= ~NILFS_TI_WRITER; + } + downgrade_write(&nilfs->ns_segctor_sem); +} + +static void nilfs_transaction_lock(struct nilfs_sb_info *sbi, + struct nilfs_transaction_info *ti, + int gcflag) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + + WARN_ON(cur_ti); + ti->ti_flags = NILFS_TI_WRITER; + ti->ti_count = 0; + ti->ti_save = cur_ti; + ti->ti_magic = NILFS_TI_MAGIC; + INIT_LIST_HEAD(&ti->ti_garbage); + current->journal_info = ti; + + for (;;) { + down_write(&sbi->s_nilfs->ns_segctor_sem); + if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags)) + break; + + nilfs_segctor_do_immediate_flush(NILFS_SC(sbi)); + + up_write(&sbi->s_nilfs->ns_segctor_sem); + yield(); + } + if (gcflag) + ti->ti_flags |= NILFS_TI_GC; +} + +static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + BUG_ON(ti->ti_count > 0); + + up_write(&sbi->s_nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + if (!list_empty(&ti->ti_garbage)) + nilfs_dispose_list(sbi, &ti->ti_garbage, 0); +} + +static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + unsigned bytes) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + unsigned blocksize = sci->sc_super->s_blocksize; + void *p; + + if (unlikely(ssp->offset + bytes > 
blocksize)) { + ssp->offset = 0; + BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, + &segbuf->sb_segsum_buffers)); + ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); + } + p = ssp->bh->b_data + ssp->offset; + ssp->offset += bytes; + return p; +} + +/** + * nilfs_segctor_reset_segment_buffer - reset the current segment buffer + * @sci: nilfs_sc_info + */ +static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + struct buffer_head *sumbh; + unsigned sumbytes; + unsigned flags = 0; + int err; + + if (nilfs_doing_gc()) + flags = NILFS_SS_GC; + err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); + if (unlikely(err)) + return err; + + sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + sumbytes = segbuf->sb_sum.sumbytes; + sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; + sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; + return 0; +} + +static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) +{ + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) + return -E2BIG; /* The current segment is filled up + (internal code) */ + sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); + return nilfs_segctor_reset_segment_buffer(sci); +} + +static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + int err; + + if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + segbuf = sci->sc_curseg; + } + err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); + if (likely(!err)) + segbuf->sb_sum.flags |= NILFS_SS_SR; + return err; +} + +/* + * Functions for making segment summary and payloads + */ +static int nilfs_segctor_segsum_block_required( + struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, + unsigned binfo_size) +{ + unsigned blocksize = sci->sc_super->s_blocksize; + /* finfo and binfo are small enough compared to the blocksize */ + + return ssp->offset + binfo_size + + (!sci->sc_blk_cnt ?
sizeof(struct nilfs_finfo) : 0) > + blocksize; +} + +static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + sci->sc_curseg->sb_sum.nfinfo++; + sci->sc_binfo_ptr = sci->sc_finfo_ptr; + nilfs_segctor_map_segsum_entry( + sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); + + if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + /* skip finfo */ +} + +static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + struct nilfs_finfo *finfo; + struct nilfs_inode_info *ii; + struct nilfs_segment_buffer *segbuf; + + if (sci->sc_blk_cnt == 0) + return; + + ii = NILFS_I(inode); + finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, + sizeof(*finfo)); + finfo->fi_ino = cpu_to_le64(inode->i_ino); + finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); + finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); + finfo->fi_cno = cpu_to_le64(ii->i_cno); + + segbuf = sci->sc_curseg; + segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); + sci->sc_finfo_ptr = sci->sc_binfo_ptr; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; +} + +static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode, + unsigned binfo_size) +{ + struct nilfs_segment_buffer *segbuf; + int required, err = 0; + + retry: + segbuf = sci->sc_curseg; + required = nilfs_segctor_segsum_block_required( + sci, &sci->sc_binfo_ptr, binfo_size); + if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { + nilfs_segctor_end_finfo(sci, inode); + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + goto retry; + } + if (unlikely(required)) { + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + goto failed; + } + if (sci->sc_blk_cnt == 0) + nilfs_segctor_begin_finfo(sci, inode); + + nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size); + /* Substitution to vblocknr is delayed until update_blocknr() */ + nilfs_segbuf_add_file_buffer(segbuf, bh); + sci->sc_blk_cnt++; + failed: + return err; +} + +static int nilfs_handle_bmap_error(int err, const char *fname, + struct inode *inode, struct super_block *sb) +{ + if (err == -EINVAL) { + nilfs_error(sb, fname, "broken bmap (inode=%lu)\n", + inode->i_ino); + err = -EIO; + } + return err; +} + +/* + * Callback functions that enumerate, mark, and collect dirty blocks + */ +static int nilfs_collect_file_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + + err = nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_v)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_file_node(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + return 0; +} + +static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); +} + +static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union 
nilfs_binfo *binfo) +{ + struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*binfo_v)); + *binfo_v = binfo->bi_v; +} + +static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *vblocknr = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*vblocknr)); + *vblocknr = binfo->bi_v.bi_vblocknr; +} + +struct nilfs_sc_operations nilfs_sc_file_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_file_bmap, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = nilfs_write_file_node_binfo, +}; + +static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + + err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_dat)); +} + +static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, + sizeof(*blkoff)); + *blkoff = binfo->bi_dat.bi_blkoff; +} + +static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_dat *binfo_dat = + nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); + *binfo_dat = binfo->bi_dat; +} + +struct nilfs_sc_operations nilfs_sc_dat_ops = { + .collect_data = nilfs_collect_dat_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_dat_bmap, + .write_data_binfo = nilfs_write_dat_data_binfo, + .write_node_binfo = nilfs_write_dat_node_binfo, +}; + +struct nilfs_sc_operations nilfs_sc_dsync_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = NULL, + .collect_bmap = NULL, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = NULL, +}; + +static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, + struct list_head *listp, + size_t nlimit, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0, last = ULONG_MAX; + size_t ndirties = 0; + int i; + + if (unlikely(start != 0 || end != LLONG_MAX)) { + /* + * A valid range is given for sync-ing data pages. The + * range is rounded to page boundaries; extra dirty buffers + * may be included if blocksize < pagesize.
+ */ + index = start >> PAGE_SHIFT; + last = end >> PAGE_SHIFT; + } + pagevec_init(&pvec, 0); + repeat: + if (unlikely(index > last) || + !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + min_t(pgoff_t, last - index, + PAGEVEC_SIZE - 1) + 1)) + return ndirties; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct buffer_head *bh, *head; + struct page *page = pvec.pages[i]; + + if (unlikely(page->index > last)) + break; + + if (mapping->host) { + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, + 1 << inode->i_blkbits, 0); + unlock_page(page); + } + + bh = head = page_buffers(page); + do { + if (!buffer_dirty(bh)) + continue; + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, listp); + ndirties++; + if (unlikely(ndirties >= nlimit)) { + pagevec_release(&pvec); + cond_resched(); + return ndirties; + } + } while (bh = bh->b_this_page, bh != head); + } + pagevec_release(&pvec); + cond_resched(); + goto repeat; +} + +static void nilfs_lookup_dirty_node_buffers(struct inode *inode, + struct list_head *listp) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct address_space *mapping = &ii->i_btnode_cache; + struct pagevec pvec; + struct buffer_head *bh, *head; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) { + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, + listp); + } + bh = bh->b_this_page; + } while (bh != head); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +static void nilfs_dispose_list(struct nilfs_sb_info *sbi, + struct list_head *head, int force) +{ + struct nilfs_inode_info *ii, *n; + struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; + unsigned nv = 0; + + while (!list_empty(head)) { + spin_lock(&sbi->s_inode_lock); + list_for_each_entry_safe(ii, n, head, i_dirty) { + list_del_init(&ii->i_dirty); + if (force) { + if (unlikely(ii->i_bh)) { + brelse(ii->i_bh); + ii->i_bh = NULL; + } + } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { + set_bit(NILFS_I_QUEUED, &ii->i_state); + list_add_tail(&ii->i_dirty, + &sbi->s_dirty_files); + continue; + } + ivec[nv++] = ii; + if (nv == SC_N_INODEVEC) + break; + } + spin_unlock(&sbi->s_inode_lock); + + for (pii = ivec; nv > 0; pii++, nv--) + iput(&(*pii)->vfs_inode); + } +} + +static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int ret = 0; + + if (nilfs_mdt_fetch_dirty(sbi->s_ifile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) + ret++; + if (ret || nilfs_doing_gc()) + if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) + ret++; + return ret; +} + +static int nilfs_segctor_clean(struct nilfs_sc_info *sci) +{ + return list_empty(&sci->sc_dirty_files) && + !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && + list_empty(&sci->sc_cleaning_segments) && + (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); +} + +static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + int ret = 0; + + if (nilfs_test_metadata_dirty(sbi)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + spin_lock(&sbi->s_inode_lock); + if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci)) + ret++; + + spin_unlock(&sbi->s_inode_lock); + return ret; +} + +static void nilfs_segctor_clear_metadata_dirty(struct 
nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_mdt_clear_dirty(sbi->s_ifile); + nilfs_mdt_clear_dirty(nilfs->ns_cpfile); + nilfs_mdt_clear_dirty(nilfs->ns_sufile); + nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); +} + +static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + /* XXX: this interface will be changed */ + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, + &raw_cp, &bh_cp); + if (likely(!err)) { + /* This duplicates part of the cpfile code, but it is + needed to collect the checkpoint even if it was not newly + created */ + nilfs_mdt_mark_buffer_dirty(bh_cp); + nilfs_mdt_mark_dirty(nilfs->ns_cpfile); + nilfs_cpfile_put_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + } else + WARN_ON(err == -EINVAL || err == -ENOENT); + + return err; +} + +static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, + &raw_cp, &bh_cp); + if (unlikely(err)) { + WARN_ON(err == -EINVAL || err == -ENOENT); + goto failed_ibh; + } + raw_cp->cp_snapshot_list.ssl_next = 0; + raw_cp->cp_snapshot_list.ssl_prev = 0; + raw_cp->cp_inodes_count = + cpu_to_le64(atomic_read(&sbi->s_inodes_count)); + raw_cp->cp_blocks_count = + cpu_to_le64(atomic_read(&sbi->s_blocks_count)); + raw_cp->cp_nblk_inc = + cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); + raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); + raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); + + if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + nilfs_checkpoint_clear_minor(raw_cp); + else + nilfs_checkpoint_set_minor(raw_cp); + + nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + return 0; + + failed_ibh: + return err; +} + +static void nilfs_fill_in_file_bmap(struct inode *ifile, + struct nilfs_inode_info *ii) + +{ + struct buffer_head *ibh; + struct nilfs_inode *raw_inode; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) { + ibh = ii->i_bh; + BUG_ON(!ibh); + raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, + ibh); + nilfs_bmap_write(ii->i_bmap, raw_inode); + nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + } +} + +static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, + struct inode *ifile) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { + nilfs_fill_in_file_bmap(ifile, ii); + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +/* + * CRC calculation routines + */ +static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed) +{ + struct nilfs_super_root *raw_sr = + (struct nilfs_super_root *)bh_sr->b_data; + u32 crc; + + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + +static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci, + u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + if (sci->sc_super_root) + nilfs_fill_in_super_root_crc(sci->sc_super_root, seed); + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { +
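/* checksum the summary blocks of each segment buffer first, then its payload blocks */ +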
nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + +static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct buffer_head *bh_sr = sci->sc_super_root; + struct nilfs_super_root *raw_sr = + (struct nilfs_super_root *)bh_sr->b_data; + unsigned isz = nilfs->ns_inode_size; + + raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); + raw_sr->sr_nongc_ctime + = cpu_to_le64(nilfs_doing_gc() ? + nilfs->ns_nongc_ctime : sci->sc_seg_ctime); + raw_sr->sr_flags = 0; + + nilfs_mdt_write_inode_direct( + nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz)); + nilfs_mdt_write_inode_direct( + nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz)); + nilfs_mdt_write_inode_direct( + nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz)); +} + +static void nilfs_redirty_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (test_bit(NILFS_I_COLLECTED, &ii->i_state)) + clear_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_drop_collected_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) + continue; + + clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + set_bit(NILFS_I_UPDATED, &ii->i_state); + } +} + +static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci, + struct inode *sufile) + +{ + struct list_head *head = &sci->sc_cleaning_segments; + struct nilfs_segment_entry *ent; + int err; + + list_for_each_entry(ent, head, list) { + if (!(ent->flags & NILFS_SLH_FREED)) + break; + err = nilfs_sufile_cancel_free(sufile, ent->segnum); + WARN_ON(err); /* does not happen */ + ent->flags &= ~NILFS_SLH_FREED; + } +} + +static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct list_head *head = &sci->sc_cleaning_segments; + struct nilfs_segment_entry *ent; + int err; + + list_for_each_entry(ent, head, list) { + err = nilfs_sufile_free(sufile, ent->segnum); + if (unlikely(err)) + return err; + ent->flags |= NILFS_SLH_FREED; + } + return 0; +} + +static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci) +{ + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); +} + +static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, + struct inode *inode, + struct list_head *listp, + int (*collect)(struct nilfs_sc_info *, + struct buffer_head *, + struct inode *)) +{ + struct buffer_head *bh, *n; + int err = 0; + + if (collect) { + list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + err = collect(sci, bh, inode); + brelse(bh); + if (unlikely(err)) + goto dispose_buffers; + } + return 0; + } + + dispose_buffers: + while (!list_empty(listp)) { + bh = list_entry(listp->next, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return err; +} + +static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) +{ + /* Remaining number of blocks within segment buffer */ + return sci->sc_segbuf_nblocks - + (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks); +} + +static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, + struct inode *inode, + struct nilfs_sc_operations *sc_ops) +{ + LIST_HEAD(data_buffers); + LIST_HEAD(node_buffers); + int err; + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + size_t n, rest = nilfs_segctor_buffer_rest(sci); + + n =
nilfs_lookup_dirty_data_buffers( + inode, &data_buffers, rest + 1, 0, LLONG_MAX); + if (n > rest) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, + sc_ops->collect_data); + BUG_ON(!err); /* always receive -E2BIG or true error */ + goto break_or_fail; + } + } + nilfs_lookup_dirty_node_buffers(inode, &node_buffers); + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, sc_ops->collect_data); + if (unlikely(err)) { + /* dispose node list */ + nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, NULL); + goto break_or_fail; + } + sci->sc_stage.flags |= NILFS_CF_NODE; + } + /* Collect node */ + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_node); + if (unlikely(err)) + goto break_or_fail; + + nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers); + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_bmap); + if (unlikely(err)) + goto break_or_fail; + + nilfs_segctor_end_finfo(sci, inode); + sci->sc_stage.flags &= ~NILFS_CF_NODE; + + break_or_fail: + return err; +} + +static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, + struct inode *inode) +{ + LIST_HEAD(data_buffers); + size_t n, rest = nilfs_segctor_buffer_rest(sci); + int err; + + n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1, + sci->sc_dsync_start, + sci->sc_dsync_end); + + err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers, + nilfs_collect_file_data); + if (!err) { + nilfs_segctor_end_finfo(sci, inode); + BUG_ON(n > rest); + /* always receive -E2BIG or true error if n > rest */ + } + return err; +} + +static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct list_head *head; + struct nilfs_inode_info *ii; + int err = 0; + + switch (sci->sc_stage.scnt) { + case NILFS_ST_INIT: + /* Pre-processes */ + sci->sc_stage.flags = 0; + + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) { + sci->sc_nblk_inc = 0; + sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; + if (mode == SC_LSEG_DSYNC) { + sci->sc_stage.scnt = NILFS_ST_DSYNC; + goto dsync_mode; + } + } + + sci->sc_stage.dirty_file_ptr = NULL; + sci->sc_stage.gc_inode_ptr = NULL; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DAT; + goto dat_stage; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_GC: + if (nilfs_doing_gc()) { + head = &sci->sc_gc_inodes; + ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr, + head, i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + err = nilfs_segctor_scan_file( + sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.gc_inode_ptr = list_entry( + ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } + sci->sc_stage.gc_inode_ptr = NULL; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_FILE: + head = &sci->sc_dirty_files; + ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, + i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + clear_bit(NILFS_I_DIRTY, &ii->i_state); + + err = nilfs_segctor_scan_file(sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.dirty_file_ptr = + list_entry(ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */ + /* XXX: required 
? */ + } + sci->sc_stage.dirty_file_ptr = NULL; + if (mode == SC_FLUSH_FILE) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; + sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; + /* Fall through */ + case NILFS_ST_IFILE: + err = nilfs_segctor_scan_file(sci, sbi->s_ifile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; + /* Creating a checkpoint */ + err = nilfs_segctor_create_checkpoint(sci); + if (unlikely(err)) + break; + /* Fall through */ + case NILFS_ST_CPFILE: + err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SUFILE: + err = nilfs_segctor_prepare_free_segments(sci, + nilfs->ns_sufile); + if (unlikely(err)) + break; + err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_DAT: + dat_stage: + err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), + &nilfs_sc_dat_ops); + if (unlikely(err)) + break; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SR: + if (mode == SC_LSEG_SR) { + /* Appending a super root */ + err = nilfs_segctor_add_super_root(sci); + if (unlikely(err)) + break; + } + /* End of a logical segment */ + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DSYNC: + dsync_mode: + sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT; + ii = sci->sc_dsync_inode; + if (!test_bit(NILFS_I_BUSY, &ii->i_state)) + break; + + err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode); + if (unlikely(err)) + break; + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DONE: + return 0; + default: + BUG(); + } + + break_or_fail: + return err; +} + +static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + int err; + + err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su); + if (unlikely(err)) + return err; + nilfs_mdt_mark_buffer_dirty(bh_su); + nilfs_mdt_mark_dirty(sufile); + nilfs_sufile_put_segment_usage(sufile, segnum, bh_su); + return 0; +} + +static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *n; + __u64 nextnum; + int err; + + if (list_empty(&sci->sc_segbufs)) { + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + return -ENOMEM; + list_add(&segbuf->sb_list, &sci->sc_segbufs); + } else + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, + nilfs); + + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_shift_to_next_segment(nilfs); + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + } + sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; + + err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum); + if (unlikely(err)) + return err; + + if (nilfs->ns_segnum == nilfs->ns_nextnum) { + /* Start from the head of a new full segment */ + err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); + if (unlikely(err)) + return err; + } else + nextnum = nilfs->ns_nextnum; + + segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); + + /* truncating segment buffers */ + 
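/* segment buffers left over from a previous construction are detached and freed below */ +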
list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, + sb_list) { + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + return 0; +} + +static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int nadd) +{ + struct nilfs_segment_buffer *segbuf, *prev, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 nextnextnum; + LIST_HEAD(list); + int err, ret, i; + + prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + /* + * Since the segment specified with nextnum might be allocated during + * the previous construction, the buffer including its segusage may + * not be dirty. The following call ensures that the buffer is dirty + * and will pin the buffer on memory until the sufile is written. + */ + err = nilfs_touch_segusage(sufile, prev->sb_nextnum); + if (unlikely(err)) + return err; + + for (i = 0; i < nadd; i++) { + /* extend segment info */ + err = -ENOMEM; + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + goto failed; + + /* map this buffer to region of segment on-disk */ + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks; + + /* allocate the next next full segment */ + err = nilfs_sufile_alloc(sufile, &nextnextnum); + if (unlikely(err)) + goto failed_segbuf; + + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1; + nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs); + + list_add_tail(&segbuf->sb_list, &list); + prev = segbuf; + } + list_splice(&list, sci->sc_segbufs.prev); + return 0; + + failed_segbuf: + nilfs_segbuf_free(segbuf); + failed: + list_for_each_entry_safe(segbuf, n, &list, sb_list) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + return err; +} + +static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf; + int ret, done = 0; + + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + if (nilfs->ns_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (segbuf->sb_io_error) { + /* Case 1: The first segment failed */ + if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) + /* Case 1a: Partial segment appended into an existing + segment */ + nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, + segbuf->sb_fseg_end); + else /* Case 1b: New full segment */ + set_nilfs_discontinued(nilfs); + done++; + } + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + if (!done && segbuf->sb_io_error) { + if (segbuf->sb_segnum != nilfs->ns_nextnum) + /* Case 2: extended segment (!= next) failed */ + nilfs_sufile_set_error(nilfs->ns_sufile, + segbuf->sb_segnum); + done++; + } + } +} + +static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) + nilfs_segbuf_clear(segbuf); + sci->sc_super_root = NULL; +} + +static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + + while (!list_empty(&sci->sc_segbufs)) { + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + /* sci->sc_curseg = NULL; */ +} + +static void 
nilfs_segctor_end_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int err) +{ + if (unlikely(err)) { + nilfs_segctor_free_incomplete_segments(sci, nilfs); + nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile); + } + nilfs_segctor_clear_segment_buffers(sci); +} + +static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + unsigned long live_blocks; + int ret; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeeds because bh_su is dirty */ + live_blocks = segbuf->sb_sum.nblocks + + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); + raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime); + raw_su->su_nblocks = cpu_to_le32(live_blocks); + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, + bh_su); + } +} + +static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeeds because bh_su is dirty */ + raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start - + segbuf->sb_fseg_start); + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su); + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeeds */ + raw_su->su_nblocks = 0; + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, + bh_su); + } +} + +static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *last, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf = last, *n; + int ret; + + list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, + sb_list) { + list_del_init(&segbuf->sb_list); + sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); + nilfs_segbuf_free(segbuf); + } +} + + +static int nilfs_segctor_collect(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int mode) +{ + struct nilfs_cstage prev_stage = sci->sc_stage; + int err, nadd = 1; + + /* Collection retry loop */ + for (;;) { + sci->sc_super_root = NULL; + sci->sc_nblk_this_inc = 0; + sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + err = nilfs_segctor_reset_segment_buffer(sci); + if (unlikely(err)) + goto failed; + + err = nilfs_segctor_collect_blocks(sci, mode); + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (!err) + break; + + if (unlikely(err != -E2BIG)) + goto failed; + + /* The current segment is filled up */ + if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + break; + + nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile); + nilfs_segctor_clear_segment_buffers(sci); + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); + sci->sc_stage = prev_stage; + } + nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); + return 0; + + failed: + return err; +} + +static void nilfs_list_replace_buffer(struct buffer_head *old_bh, + struct
buffer_head *new_bh) +{ + BUG_ON(!list_empty(&new_bh->b_assoc_buffers)); + + list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers); + /* The caller must release old_bh */ +} + +static int +nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *segbuf, + int mode) +{ + struct inode *inode = NULL; + sector_t blocknr; + unsigned long nfinfo = segbuf->sb_sum.nfinfo; + unsigned long nblocks = 0, ndatablk = 0; + struct nilfs_sc_operations *sc_op = NULL; + struct nilfs_segsum_pointer ssp; + struct nilfs_finfo *finfo = NULL; + union nilfs_binfo binfo; + struct buffer_head *bh, *bh_org; + ino_t ino = 0; + int err = 0; + + if (!nfinfo) + goto out; + + blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk; + ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + ssp.offset = sizeof(struct nilfs_segment_summary); + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + if (bh == sci->sc_super_root) + break; + if (!finfo) { + finfo = nilfs_segctor_map_segsum_entry( + sci, &ssp, sizeof(*finfo)); + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + + if (buffer_nilfs_node(bh)) + inode = NILFS_BTNC_I(bh->b_page->mapping); + else + inode = NILFS_AS_I(bh->b_page->mapping); + + if (mode == SC_LSEG_DSYNC) + sc_op = &nilfs_sc_dsync_ops; + else if (ino == NILFS_DAT_INO) + sc_op = &nilfs_sc_dat_ops; + else /* file blocks */ + sc_op = &nilfs_sc_file_ops; + } + bh_org = bh; + get_bh(bh_org); + err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr, + &binfo); + if (bh != bh_org) + nilfs_list_replace_buffer(bh_org, bh); + brelse(bh_org); + if (unlikely(err)) + goto failed_bmap; + + if (ndatablk > 0) + sc_op->write_data_binfo(sci, &ssp, &binfo); + else + sc_op->write_node_binfo(sci, &ssp, &binfo); + + blocknr++; + if (--nblocks == 0) { + finfo = NULL; + if (--nfinfo == 0) + break; + } else if (ndatablk > 0) + ndatablk--; + } + out: + return 0; + + failed_bmap: + err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super); + return err; +} + +static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_segment_buffer *segbuf; + int err; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode); + if (unlikely(err)) + return err; + nilfs_segbuf_fill_in_segsum(segbuf); + } + return 0; +} + +static int +nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) +{ + struct page *clone_page; + struct buffer_head *bh, *head, *bh2; + void *kaddr; + + bh = head = page_buffers(page); + + clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0); + if (unlikely(!clone_page)) + return -ENOMEM; + + bh2 = page_buffers(clone_page); + kaddr = kmap_atomic(page, KM_USER0); + do { + if (list_empty(&bh->b_assoc_buffers)) + continue; + get_bh(bh2); + page_cache_get(clone_page); /* for each bh */ + memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size); + bh2->b_blocknr = bh->b_blocknr; + list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers); + list_add_tail(&bh->b_assoc_buffers, out); + } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head); + kunmap_atomic(kaddr, KM_USER0); + + if (!TestSetPageWriteback(clone_page)) + inc_zone_page_state(clone_page, NR_WRITEBACK); + unlock_page(clone_page); + + return 0; +} + +static int nilfs_test_page_to_be_frozen(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if 
(!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode)) + return 0; + + if (page_mapped(page)) { + ClearPageChecked(page); + return 1; + } + return PageChecked(page); +} + +static int nilfs_begin_page_io(struct page *page, struct list_head *out) +{ + if (!page || PageWriteback(page)) + /* For split b-tree node pages, this function may be called + twice. The second and later calls are ignored by this check. */ + return 0; + + lock_page(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); + + if (nilfs_test_page_to_be_frozen(page)) { + int err = nilfs_copy_replace_page_buffers(page, out); + if (unlikely(err)) + return err; + } + return 0; +} + +static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, + struct page **failed_page) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct list_head *list = &sci->sc_copied_buffers; + int err; + + *failed_page = NULL; + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + err = nilfs_begin_page_io(fs_page, list); + if (unlikely(err)) { + *failed_page = fs_page; + goto out; + } + fs_page = bh->b_page; + } + } + } + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + err = nilfs_begin_page_io(fs_page, list); + if (unlikely(err)) + *failed_page = fs_page; + out: + return err; +} + +static int nilfs_segctor_write(struct nilfs_sc_info *sci, + struct backing_dev_info *bdi) +{ + struct nilfs_segment_buffer *segbuf; + struct nilfs_write_info wi; + int err, res; + + wi.sb = sci->sc_super; + wi.bh_sr = sci->sc_super_root; + wi.bdi = bdi; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + nilfs_segbuf_prepare_write(segbuf, &wi); + err = nilfs_segbuf_write(segbuf, &wi); + + res = nilfs_segbuf_wait(segbuf, &wi); + err = unlikely(err) ?
: res; + if (unlikely(err)) + return err; + } + return 0; +} + +static int nilfs_page_has_uncleared_buffer(struct page *page) +{ + struct buffer_head *head, *bh; + + head = bh = page_buffers(page); + do { + if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers)) + return 1; + bh = bh->b_this_page; + } while (bh != head); + return 0; +} + +static void __nilfs_end_page_io(struct page *page, int err) +{ + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); + } + + if (buffer_nilfs_allocated(page_buffers(page))) { + if (TestClearPageWriteback(page)) + dec_zone_page_state(page, NR_WRITEBACK); + } else + end_page_writeback(page); +} + +static void nilfs_end_page_io(struct page *page, int err) +{ + if (!page) + return; + + if (buffer_nilfs_node(page_buffers(page)) && + nilfs_page_has_uncleared_buffer(page)) + /* For b-tree node pages, this function may be called twice + or more because they might be split in a segment. + This check assures that cleanup has been done for all + buffers in a split btnode page. */ + return; + + __nilfs_end_page_io(page, err); +} + +static void nilfs_clear_copied_buffers(struct list_head *list, int err) +{ + struct buffer_head *bh, *head; + struct page *page; + + while (!list_empty(list)) { + bh = list_entry(list->next, struct buffer_head, + b_assoc_buffers); + page = bh->b_page; + page_cache_get(page); + head = bh = page_buffers(page); + do { + if (!list_empty(&bh->b_assoc_buffers)) { + list_del_init(&bh->b_assoc_buffers); + if (!err) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + } + brelse(bh); /* for b_assoc_buffers */ + } + } while ((bh = bh->b_this_page) != head); + + __nilfs_end_page_io(page, err); + page_cache_release(page); + } +} + +static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, + struct page *failed_page, int err) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, err); + if (unlikely(fs_page == failed_page)) + goto done; + fs_page = bh->b_page; + } + } + } + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, err); + done: + nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); +} + +static void nilfs_set_next_segment(struct the_nilfs *nilfs, + struct nilfs_segment_buffer *segbuf) +{ + nilfs->ns_segnum = segbuf->sb_segnum; + nilfs->ns_nextnum = segbuf->sb_nextnum; + nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start + + segbuf->sb_sum.nblocks; + nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq; + nilfs->ns_ctime = segbuf->sb_sum.ctime; +} + +static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + int update_sr = (sci->sc_super_root != NULL); + + 
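/* the write-out succeeded: mark every written buffer up to date and clean, and end writeback on its page */ +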
list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + /* + * We assume that buffers belonging to the same page are + * contiguous in the buffer list. + * Under this assumption, the last BH of each page is + * identifiable by the discontinuity of bh->b_page + * (page != fs_page). + * + * For B-tree node blocks, however, this assumption is not + * guaranteed. The cleanup code of B-tree node pages needs + * special care. + */ + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, 0); + fs_page = bh->b_page; + } + } + + if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) { + if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) { + set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + sci->sc_lseg_stime = jiffies; + } + if (NILFS_SEG_LOGEND(&segbuf->sb_sum)) + clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + } + } + /* + * Since pages may span multiple segment buffers, the + * end of the last page must be checked outside of the loop. + */ + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, 0); + + nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0); + + nilfs_drop_collected_inodes(&sci->sc_dirty_files); + + if (nilfs_doing_gc()) { + nilfs_drop_collected_inodes(&sci->sc_gc_inodes); + if (update_sr) + nilfs_commit_gcdat_inode(nilfs); + } else + nilfs->ns_nongc_ctime = sci->sc_seg_ctime; + + sci->sc_nblk_inc += sci->sc_nblk_this_inc; + + segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + nilfs_set_next_segment(nilfs, segbuf); + + if (update_sr) { + nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, + segbuf->sb_sum.seg_seq, nilfs->ns_cno++); + sbi->s_super->s_dirt = 1; + + clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); + set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); + } else + clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); +} + +static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, + struct nilfs_sb_info *sbi) +{ + struct nilfs_inode_info *ii, *n; + __u64 cno = sbi->s_nilfs->ns_cno; + + spin_lock(&sbi->s_inode_lock); + retry: + list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) { + if (!ii->i_bh) { + struct buffer_head *ibh; + int err; + + spin_unlock(&sbi->s_inode_lock); + err = nilfs_ifile_get_inode_block( + sbi->s_ifile, ii->vfs_inode.i_ino, &ibh); + if (unlikely(err)) { + nilfs_warning(sbi->s_super, __func__, + "failed to get inode block.\n"); + return err; + } + nilfs_mdt_mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(sbi->s_ifile); + spin_lock(&sbi->s_inode_lock); + if (likely(!ii->i_bh)) + ii->i_bh = ibh; + else + brelse(ibh); + goto retry; + } + ii->i_cno = cno; + + clear_bit(NILFS_I_QUEUED, &ii->i_state); + set_bit(NILFS_I_BUSY, &ii->i_state); + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); + } + spin_unlock(&sbi->s_inode_lock); + + NILFS_I(sbi->s_ifile)->i_cno = cno; + + return 0; +} + +static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci, + struct nilfs_sb_info *sbi) +{ + struct nilfs_transaction_info *ti =
current->journal_info; + struct nilfs_inode_info *ii, *n; + __u64 cno = sbi->s_nilfs->ns_cno; + + spin_lock(&sbi->s_inode_lock); + list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { + if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || + test_bit(NILFS_I_DIRTY, &ii->i_state)) { + /* The current checkpoint number (=nilfs->ns_cno) is + changed between check-in and check-out only if the + super root is written out. So, we can update i_cno + for the inodes that remain in the dirty list. */ + ii->i_cno = cno; + continue; + } + clear_bit(NILFS_I_BUSY, &ii->i_state); + brelse(ii->i_bh); + ii->i_bh = NULL; + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &ti->ti_garbage); + } + spin_unlock(&sbi->s_inode_lock); +} + +/* + * Main procedure of segment constructor + */ +static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct page *failed_page; + int err, has_sr = 0; + + sci->sc_stage.scnt = NILFS_ST_INIT; + + err = nilfs_segctor_check_in_files(sci, sbi); + if (unlikely(err)) + goto out; + + if (nilfs_test_metadata_dirty(sbi)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + if (nilfs_segctor_clean(sci)) + goto out; + + do { + sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; + + err = nilfs_segctor_begin_construction(sci, nilfs); + if (unlikely(err)) + goto out; + + /* Update time stamp */ + sci->sc_seg_ctime = get_seconds(); + + err = nilfs_segctor_collect(sci, nilfs, mode); + if (unlikely(err)) + goto failed; + + has_sr = (sci->sc_super_root != NULL); + + /* Avoid empty segment */ + if (sci->sc_stage.scnt == NILFS_ST_DONE && + NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { + nilfs_segctor_end_construction(sci, nilfs, 1); + goto out; + } + + err = nilfs_segctor_assign(sci, mode); + if (unlikely(err)) + goto failed; + + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); + + if (has_sr) { + err = nilfs_segctor_fill_in_checkpoint(sci); + if (unlikely(err)) + goto failed_to_make_up; + + nilfs_segctor_fill_in_super_root(sci, nilfs); + } + nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); + + /* Write partial segments */ + err = nilfs_segctor_prepare_write(sci, &failed_page); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); + + err = nilfs_segctor_write(sci, nilfs->ns_bdi); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_complete_write(sci); + + /* Commit segments */ + if (has_sr) { + nilfs_segctor_commit_free_segments(sci); + nilfs_segctor_clear_metadata_dirty(sci); + } + + nilfs_segctor_end_construction(sci, nilfs, 0); + + } while (sci->sc_stage.scnt != NILFS_ST_DONE); + + out: + nilfs_segctor_destroy_segment_buffers(sci); + nilfs_segctor_check_out_files(sci, sbi); + return err; + + failed_to_write: + nilfs_segctor_abort_write(sci, failed_page, err); + nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile); + + failed_to_make_up: + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_redirty_inodes(&sci->sc_dirty_files); + + failed: + if (nilfs_doing_gc()) + nilfs_redirty_inodes(&sci->sc_gc_inodes); + nilfs_segctor_end_construction(sci, nilfs, err); + goto out; +} + +/** + * nilfs_segctor_start_timer - set timer of background write + * @sci: nilfs_sc_info + * + * If the timer has already been set, it ignores the new request. + * This function MUST be called within a section locking the segment + * semaphore.
+ */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer->expires = jiffies + sci->sc_interval; + add_timer(sci->sc_timer); + sci->sc_state |= NILFS_SEGCTOR_COMMIT; + } + spin_unlock(&sci->sc_state_lock); +} + +static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_flush_request & (1 << bn))) { + unsigned long prev_req = sci->sc_flush_request; + + sci->sc_flush_request |= (1 << bn); + if (!prev_req) + wake_up(&sci->sc_wait_daemon); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_flush_segment - trigger a segment construction for resource control + * @sb: super block + * @ino: inode number of the file to be flushed out. + */ +void nilfs_flush_segment(struct super_block *sb, ino_t ino) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + + if (!sci || nilfs_doing_construction()) + return; + nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); + /* assign bit 0 to data files */ +} + +int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci, + __u64 *segnum, size_t nsegs) +{ + struct nilfs_segment_entry *ent; + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; + struct inode *sufile = nilfs->ns_sufile; + LIST_HEAD(list); + __u64 *pnum; + size_t i; + int err; + + for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) { + ent = nilfs_alloc_segment_entry(*pnum); + if (unlikely(!ent)) { + err = -ENOMEM; + goto failed; + } + list_add_tail(&ent->list, &list); + + err = nilfs_open_segment_entry(ent, sufile); + if (unlikely(err)) + goto failed; + + if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su))) + printk(KERN_WARNING "NILFS: unused segment is " + "requested to be cleaned (segnum=%llu)\n", + (unsigned long long)ent->segnum); + nilfs_close_segment_entry(ent, sufile); + } + list_splice(&list, sci->sc_cleaning_segments.prev); + return 0; + + failed: + nilfs_dispose_segment_list(&list); + return err; +} + +void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci) +{ + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); +} + +struct nilfs_segctor_wait_request { + wait_queue_t wq; + __u32 seq; + int err; + atomic_t done; +}; + +static int nilfs_segctor_sync(struct nilfs_sc_info *sci) +{ + struct nilfs_segctor_wait_request wait_req; + int err = 0; + + spin_lock(&sci->sc_state_lock); + init_wait(&wait_req.wq); + wait_req.err = 0; + atomic_set(&wait_req.done, 0); + wait_req.seq = ++sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + init_waitqueue_entry(&wait_req.wq, current); + add_wait_queue(&sci->sc_wait_request, &wait_req.wq); + set_current_state(TASK_INTERRUPTIBLE); + wake_up(&sci->sc_wait_daemon); + + for (;;) { + if (atomic_read(&wait_req.done)) { + err = wait_req.err; + break; + } + if (!signal_pending(current)) { + schedule(); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(&sci->sc_wait_request, &wait_req.wq); + return err; +} + +static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) +{ + struct nilfs_segctor_wait_request *wrq, *n; + unsigned long flags; + + spin_lock_irqsave(&sci->sc_wait_request.lock, flags); + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, + wq.task_list) { + if (!atomic_read(&wrq->done) && + nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { + wrq->err = err; + atomic_set(&wrq->done, 1); + } + if (atomic_read(&wrq->done)) { + 
wrq->wq.func(&wrq->wq, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); +} + +/** + * nilfs_construct_segment - construct a logical segment + * @sb: super block + * + * Return Value: On success, 0 is returned. On errors, one of the following + * negative error codes is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_segment(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct nilfs_transaction_info *ti; + int err; + + if (!sci) + return -EROFS; + + /* A call inside transactions causes a deadlock. */ + BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); + + err = nilfs_segctor_sync(sci); + return err; +} + +/** + * nilfs_construct_dsync_segment - construct a data-only logical segment + * @sb: super block + * @inode: inode whose data blocks should be written out + * @start: start byte offset + * @end: end byte offset (inclusive) + * + * Return Value: On success, 0 is returned. On errors, one of the following + * negative error codes is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, + loff_t start, loff_t end) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct nilfs_inode_info *ii; + struct nilfs_transaction_info ti; + int err = 0; + + if (!sci) + return -EROFS; + + nilfs_transaction_lock(sbi, &ti, 0); + + ii = NILFS_I(inode); + if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || + nilfs_test_opt(sbi, STRICT_ORDER) || + test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + nilfs_discontinued(sbi->s_nilfs)) { + nilfs_transaction_unlock(sbi); + err = nilfs_segctor_sync(sci); + return err; + } + + spin_lock(&sbi->s_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + spin_unlock(&sbi->s_inode_lock); + nilfs_transaction_unlock(sbi); + return 0; + } + spin_unlock(&sbi->s_inode_lock); + sci->sc_dsync_inode = ii; + sci->sc_dsync_start = start; + sci->sc_dsync_end = end; + + err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); + + nilfs_transaction_unlock(sbi); + return err; +} + +struct nilfs_segctor_req { + int mode; + __u32 seq_accepted; + int sc_err; /* construction failure */ + int sb_err; /* super block writeback failure */ +}; + +#define FLUSH_FILE_BIT (0x1) /* data file only */ +#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ + +static void nilfs_segctor_accept(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + req->sc_err = req->sb_err = 0; + spin_lock(&sci->sc_state_lock); + req->seq_accepted = sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + if (sci->sc_timer) + del_timer_sync(sci->sc_timer); +} + +static void nilfs_segctor_notify(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + /* Clear requests (even when the construction failed) */ + spin_lock(&sci->sc_state_lock); + + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; + + if (req->mode == SC_LSEG_SR) { + sci->sc_seq_done = req->seq_accepted; +
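/* wake tasks waiting in nilfs_segctor_sync(); a construction error, if any, takes precedence over a super block write error */ +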
nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); + sci->sc_flush_request = 0; + } else if (req->mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (req->mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + + spin_unlock(&sci->sc_state_lock); +} + +static int nilfs_segctor_construct(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + int err = 0; + + if (nilfs_discontinued(nilfs)) + req->mode = SC_LSEG_SR; + if (!nilfs_segctor_confirm(sci)) { + err = nilfs_segctor_do_construct(sci, req->mode); + req->sc_err = err; + } + if (likely(!err)) { + if (req->mode != SC_FLUSH_DAT) + atomic_set(&nilfs->ns_ndirtyblks, 0); + if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && + nilfs_discontinued(nilfs)) { + down_write(&nilfs->ns_sem); + req->sb_err = nilfs_commit_super(sbi, 0); + up_write(&nilfs->ns_sem); + } + } + return err; +} + +static void nilfs_construction_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +static void +nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) +{ + struct nilfs_inode_info *ii, *n; + + list_for_each_entry_safe(ii, n, head, i_dirty) { + if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) + continue; + hlist_del_init(&ii->vfs_inode.i_hash); + list_del_init(&ii->i_dirty); + nilfs_clear_gcinode(&ii->vfs_inode); + } +} + +int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, + void **kbufs) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; + int err; + + if (unlikely(!sci)) + return -EROFS; + + nilfs_transaction_lock(sbi, &ti, 1); + + err = nilfs_init_gcdat_inode(nilfs); + if (unlikely(err)) + goto out_unlock; + err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); + if (unlikely(err)) + goto out_unlock; + + list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); + + for (;;) { + nilfs_segctor_accept(sci, &req); + err = nilfs_segctor_construct(sci, &req); + nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); + nilfs_segctor_notify(sci, &req); + + if (likely(!err)) + break; + + nilfs_warning(sb, __func__, + "segment construction failed. (err=%d)", err); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(sci->sc_interval); + } + + out_unlock: + nilfs_clear_gcdat_inode(nilfs); + nilfs_transaction_unlock(sbi); + return err; +} + +static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = mode }; + + nilfs_transaction_lock(sbi, &ti, 0); + + nilfs_segctor_accept(sci, &req); + nilfs_segctor_construct(sci, &req); + nilfs_segctor_notify(sci, &req); + + /* + * An unclosed segment should be retried. We do this using sc_timer. + * A timeout of sc_timer invokes complete construction, which + * closes the current logical segment. + */ + if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) + nilfs_segctor_start_timer(sci); + + nilfs_transaction_unlock(sbi); +} + +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) +{ + int mode = 0; + int err; + + spin_lock(&sci->sc_state_lock); + mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
+ SC_FLUSH_DAT : SC_FLUSH_FILE; + spin_unlock(&sci->sc_state_lock); + + if (mode) { + err = nilfs_segctor_do_construct(sci, mode); + + spin_lock(&sci->sc_state_lock); + sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? + ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; + spin_unlock(&sci->sc_state_lock); + } + clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); +} + +static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) +{ + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { + if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) + return SC_FLUSH_FILE; + else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) + return SC_FLUSH_DAT; + } + return SC_LSEG_SR; +} + +/** + * nilfs_segctor_thread - main loop of the segment constructor thread. + * @arg: pointer to a struct nilfs_sc_info. + * + * nilfs_segctor_thread() initializes a timer and serves as a daemon + * to execute segment constructions. + */ +static int nilfs_segctor_thread(void *arg) +{ + struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; + struct timer_list timer; + int timeout = 0; + + init_timer(&timer); + timer.data = (unsigned long)current; + timer.function = nilfs_construction_timeout; + sci->sc_timer = &timer; + + /* start sync. */ + sci->sc_task = current; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ + printk(KERN_INFO + "segctord starting. Construction interval = %lu seconds, " + "CP frequency < %lu seconds\n", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + + spin_lock(&sci->sc_state_lock); + loop: + for (;;) { + int mode; + + if (sci->sc_state & NILFS_SEGCTOR_QUIT) + goto end_thread; + + if (timeout || sci->sc_seq_request != sci->sc_seq_done) + mode = SC_LSEG_SR; + else if (!sci->sc_flush_request) + break; + else + mode = nilfs_segctor_flush_mode(sci); + + spin_unlock(&sci->sc_state_lock); + nilfs_segctor_thread_construct(sci, mode); + spin_lock(&sci->sc_state_lock); + timeout = 0; + } + + + if (freezing(current)) { + spin_unlock(&sci->sc_state_lock); + refrigerator(); + spin_lock(&sci->sc_state_lock); + } else { + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&sci->sc_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + + if (sci->sc_seq_request != sci->sc_seq_done) + should_sleep = 0; + else if (sci->sc_flush_request) + should_sleep = 0; + else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) + should_sleep = time_before(jiffies, + sci->sc_timer->expires); + + if (should_sleep) { + spin_unlock(&sci->sc_state_lock); + schedule(); + spin_lock(&sci->sc_state_lock); + } + finish_wait(&sci->sc_wait_daemon, &wait); + timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_after_eq(jiffies, sci->sc_timer->expires)); + } + goto loop; + + end_thread: + spin_unlock(&sci->sc_state_lock); + del_timer_sync(sci->sc_timer); + sci->sc_timer = NULL; + + /* end sync. 
*/ + sci->sc_task = NULL; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + return 0; +} + +static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) +{ + struct task_struct *t; + + t = kthread_run(nilfs_segctor_thread, sci, "segctord"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + + printk(KERN_ERR "NILFS: error %d creating segctord thread\n", + err); + return err; + } + wait_event(sci->sc_wait_task, sci->sc_task != NULL); + return 0; +} + +static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) +{ + sci->sc_state |= NILFS_SEGCTOR_QUIT; + + while (sci->sc_task) { + wake_up(&sci->sc_wait_daemon); + spin_unlock(&sci->sc_state_lock); + wait_event(sci->sc_wait_task, sci->sc_task == NULL); + spin_lock(&sci->sc_state_lock); + } +} + +static int nilfs_segctor_init(struct nilfs_sc_info *sci) +{ + sci->sc_seq_done = sci->sc_seq_request; + + return nilfs_segctor_start_thread(sci); +} + +/* + * Setup & clean-up functions + */ +static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) +{ + struct nilfs_sc_info *sci; + + sci = kzalloc(sizeof(*sci), GFP_KERNEL); + if (!sci) + return NULL; + + sci->sc_sbi = sbi; + sci->sc_super = sbi->s_super; + + init_waitqueue_head(&sci->sc_wait_request); + init_waitqueue_head(&sci->sc_wait_daemon); + init_waitqueue_head(&sci->sc_wait_task); + spin_lock_init(&sci->sc_state_lock); + INIT_LIST_HEAD(&sci->sc_dirty_files); + INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_gc_inodes); + INIT_LIST_HEAD(&sci->sc_cleaning_segments); + INIT_LIST_HEAD(&sci->sc_copied_buffers); + + sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; + sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; + sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; + + if (sbi->s_interval) + sci->sc_interval = sbi->s_interval; + if (sbi->s_watermark) + sci->sc_watermark = sbi->s_watermark; + return sci; +} + +static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) +{ + int ret, retrycount = NILFS_SC_CLEANUP_RETRY; + + /* The segctord thread was stopped and its timer was removed. + But some tasks remain. */ + do { + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; + + nilfs_transaction_lock(sbi, &ti, 0); + nilfs_segctor_accept(sci, &req); + ret = nilfs_segctor_construct(sci, &req); + nilfs_segctor_notify(sci, &req); + nilfs_transaction_unlock(sbi); + + } while (ret && retrycount-- > 0); +} + +/** + * nilfs_segctor_destroy - destroy the segment constructor. + * @sci: nilfs_sc_info + * + * nilfs_segctor_destroy() kills the segctord thread and frees + * the nilfs_sc_info struct. + * Caller must hold the segment semaphore. 
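 *
 * Editor's sketch (not part of the patch): the expected calling
 * sequence, which nilfs_detach_segment_constructor() below follows,
 * is
 *
 *	down_write(&nilfs->ns_segctor_sem);
 *	if (NILFS_SC(sbi)) {
 *		nilfs_segctor_destroy(NILFS_SC(sbi));
 *		sbi->s_sc_info = NULL;
 *	}
 *	up_write(&nilfs->ns_segctor_sem);
 *
 * Note that the function itself temporarily drops and retakes the
 * semaphore while it writes out remaining dirty blocks.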
+ */ +static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + int flag; + + up_write(&sbi->s_nilfs->ns_segctor_sem); + + spin_lock(&sci->sc_state_lock); + nilfs_segctor_kill_thread(sci); + flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request + || sci->sc_seq_request != sci->sc_seq_done); + spin_unlock(&sci->sc_state_lock); + + if (flag || nilfs_segctor_confirm(sci)) + nilfs_segctor_write_out(sci); + + WARN_ON(!list_empty(&sci->sc_copied_buffers)); + + if (!list_empty(&sci->sc_dirty_files)) { + nilfs_warning(sbi->s_super, __func__, + "dirty file(s) after the final construction\n"); + nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); + } + + if (!list_empty(&sci->sc_cleaning_segments)) + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); + + WARN_ON(!list_empty(&sci->sc_segbufs)); + + down_write(&sbi->s_nilfs->ns_segctor_sem); + + kfree(sci); +} + +/** + * nilfs_attach_segment_constructor - attach a segment constructor + * @sbi: nilfs_sb_info + * + * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, + * initializes it, and starts the segment constructor. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + + /* Each field of nilfs_sc_info is cleared through the initialization + of super-block info */ + sbi->s_sc_info = nilfs_segctor_new(sbi); + if (!sbi->s_sc_info) + return -ENOMEM; + + nilfs_attach_writer(nilfs, sbi); + err = nilfs_segctor_init(NILFS_SC(sbi)); + if (err) { + nilfs_detach_writer(nilfs, sbi); + kfree(sbi->s_sc_info); + sbi->s_sc_info = NULL; + } + return err; +} + +/** + * nilfs_detach_segment_constructor - destroy the segment constructor + * @sbi: nilfs_sb_info + * + * nilfs_detach_segment_constructor() kills the segment constructor daemon, + * frees the struct nilfs_sc_info, and destroys the dirty file list. + */ +void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + LIST_HEAD(garbage_list); + + down_write(&nilfs->ns_segctor_sem); + if (NILFS_SC(sbi)) { + nilfs_segctor_destroy(NILFS_SC(sbi)); + sbi->s_sc_info = NULL; + } + + /* Forcibly free the list of dirty files */ + spin_lock(&sbi->s_inode_lock); + if (!list_empty(&sbi->s_dirty_files)) { + list_splice_init(&sbi->s_dirty_files, &garbage_list); + nilfs_warning(sbi->s_super, __func__, + "Non-empty dirty list after the last " + "segment construction\n"); + } + spin_unlock(&sbi->s_inode_lock); + up_write(&nilfs->ns_segctor_sem); + + nilfs_dispose_list(sbi, &garbage_list, 1); + nilfs_detach_writer(nilfs, sbi); +} diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h new file mode 100644 index 000000000000..476bdd5df5be --- /dev/null +++ b/fs/nilfs2/segment.h @@ -0,0 +1,244 @@ +/* + * segment.h - NILFS Segment constructor prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGMENT_H +#define _NILFS_SEGMENT_H + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "sb.h" + +/** + * struct nilfs_recovery_info - Recovery information + * @ri_need_recovery: Recovery status + * @ri_super_root: Block number of the last super root + * @ri_cno: Number of the last checkpoint + * @ri_lsegs_start: Region for roll-forwarding (start block number) + * @ri_lsegs_end: Region for roll-forwarding (end block number) + * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start + * @ri_used_segments: List of segments to be marked active + * @ri_pseg_start: Block number of the last partial segment + * @ri_seq: Sequence number on the last partial segment + * @ri_segnum: Segment number on the last partial segment + * @ri_nextnum: Next segment number on the last partial segment + */ +struct nilfs_recovery_info { + int ri_need_recovery; + sector_t ri_super_root; + __u64 ri_cno; + + sector_t ri_lsegs_start; + sector_t ri_lsegs_end; + u64 ri_lsegs_start_seq; + struct list_head ri_used_segments; + sector_t ri_pseg_start; + u64 ri_seq; + __u64 ri_segnum; + __u64 ri_nextnum; +}; + +/* ri_need_recovery */ +#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */ +#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */ + +/** + * struct nilfs_cstage - Context of collection stage + * @scnt: Stage count + * @flags: State flags + * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file + * @gc_inode_ptr: Pointer on the list of gc-inodes + */ +struct nilfs_cstage { + int scnt; + unsigned flags; + struct nilfs_inode_info *dirty_file_ptr; + struct nilfs_inode_info *gc_inode_ptr; +}; + +struct nilfs_segment_buffer; + +struct nilfs_segsum_pointer { + struct buffer_head *bh; + unsigned offset; /* offset in bytes */ +}; + +/** + * struct nilfs_sc_info - Segment constructor information + * @sc_super: Back pointer to super_block struct + * @sc_sbi: Back pointer to nilfs_sb_info struct + * @sc_nblk_inc: Block count of current generation + * @sc_dirty_files: List of files to be written + * @sc_gc_inodes: List of GC inodes having blocks to be written + * @sc_cleaning_segments: List of segments to be freed through construction + * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data + * @sc_dsync_inode: inode whose data pages are written for a sync operation + * @sc_dsync_start: start byte offset of data pages + * @sc_dsync_end: end byte offset of data pages (inclusive) + * @sc_segbufs: List of segment buffers + * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
+ * @sc_curseg: Current segment buffer + * @sc_super_root: Pointer to the super root buffer + * @sc_stage: Collection stage + * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary + * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary + * @sc_blk_cnt: Block count of a file + * @sc_datablk_cnt: Data block count of a file + * @sc_nblk_this_inc: Number of blocks included in the current logical segment + * @sc_seg_ctime: Creation time + * @sc_flags: Internal flags + * @sc_state_lock: spinlock for sc_state and so on + * @sc_state: Segctord state flags + * @sc_flush_request: inode bitmap of metadata files to be flushed + * @sc_wait_request: Client request queue + * @sc_wait_daemon: Daemon wait queue + * @sc_wait_task: Start/end wait queue to control segctord task + * @sc_seq_request: Request counter + * @sc_seq_done: Completion counter + * @sc_sync: Request of explicit sync operation + * @sc_interval: Timeout value of background construction + * @sc_mjcp_freq: Frequency of creating checkpoints + * @sc_lseg_stime: Start time of the latest logical segment + * @sc_watermark: Watermark for the number of dirty buffers + * @sc_timer: Timer for segctord + * @sc_task: current thread of segctord + */ +struct nilfs_sc_info { + struct super_block *sc_super; + struct nilfs_sb_info *sc_sbi; + + unsigned long sc_nblk_inc; + + struct list_head sc_dirty_files; + struct list_head sc_gc_inodes; + struct list_head sc_cleaning_segments; + struct list_head sc_copied_buffers; + + struct nilfs_inode_info *sc_dsync_inode; + loff_t sc_dsync_start; + loff_t sc_dsync_end; + + /* Segment buffers */ + struct list_head sc_segbufs; + unsigned long sc_segbuf_nblocks; + struct nilfs_segment_buffer *sc_curseg; + struct buffer_head *sc_super_root; + + struct nilfs_cstage sc_stage; + + struct nilfs_segsum_pointer sc_finfo_ptr; + struct nilfs_segsum_pointer sc_binfo_ptr; + unsigned long sc_blk_cnt; + unsigned long sc_datablk_cnt; + unsigned long sc_nblk_this_inc; + time_t sc_seg_ctime; + + unsigned long sc_flags; + + spinlock_t sc_state_lock; + unsigned long sc_state; + unsigned long sc_flush_request; + + wait_queue_head_t sc_wait_request; + wait_queue_head_t sc_wait_daemon; + wait_queue_head_t sc_wait_task; + + __u32 sc_seq_request; + __u32 sc_seq_done; + + int sc_sync; + unsigned long sc_interval; + unsigned long sc_mjcp_freq; + unsigned long sc_lseg_stime; /* in 1/HZ seconds */ + unsigned long sc_watermark; + + struct timer_list *sc_timer; + struct task_struct *sc_task; +}; + +/* sc_flags */ +enum { + NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ + NILFS_SC_UNCLOSED, /* Logical segment is not closed */ + NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ + NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a + checkpoint */ + NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files + other than DAT, cpfile, sufile, or files + moved by GC */ +}; + +/* sc_state */ +#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */ +#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */ + +/* + * Constant parameters + */ +#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when + destroying segctord */ + +/* + * Default values of timeout, in seconds. + */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. 
+ It triggers construction of a + logical segment with a super root */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root + creation */ + +/* + * The default threshold amount of data, in block counts. + */ +#define NILFS_SC_DEFAULT_WATERMARK 3600 + + +/* segment.c */ +extern int nilfs_init_transaction_cache(void); +extern void nilfs_destroy_transaction_cache(void); +extern void nilfs_relax_pressure_in_lock(struct super_block *); + +extern int nilfs_construct_segment(struct super_block *); +extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, + loff_t, loff_t); +extern void nilfs_flush_segment(struct super_block *, ino_t); +extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, + void **); + +extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *, + __u64 *, size_t); +extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *); + +extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); +extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); + +/* recovery.c */ +extern int nilfs_read_super_root_block(struct super_block *, sector_t, + struct buffer_head **, int); +extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, + struct nilfs_recovery_info *); +extern int nilfs_recover_logical_segments(struct the_nilfs *, + struct nilfs_sb_info *, + struct nilfs_recovery_info *); + +#endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c new file mode 100644 index 000000000000..98e68677f045 --- /dev/null +++ b/fs/nilfs2/sufile.c @@ -0,0 +1,558 @@ +/* + * sufile.c - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. 
+ */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/errno.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "sufile.h" + + +static inline unsigned long +nilfs_sufile_segment_usages_per_block(const struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_entries_per_block; +} + +static unsigned long +nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); + return (unsigned long)t; +} + +static unsigned long +nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); +} + +static unsigned long +nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, + __u64 max) +{ + return min_t(unsigned long, + nilfs_sufile_segment_usages_per_block(sufile) - + nilfs_sufile_get_offset(sufile, curr), + max - curr + 1); +} + +static inline struct nilfs_sufile_header * +nilfs_sufile_block_get_header(const struct inode *sufile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_segment_usage * +nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, + struct buffer_head *bh, void *kaddr) +{ + return kaddr + bh_offset(bh) + + nilfs_sufile_get_offset(sufile, segnum) * + NILFS_MDT(sufile)->mi_entry_size; +} + +static inline int nilfs_sufile_get_header_block(struct inode *sufile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); +} + +static inline int +nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum), + create, NULL, bhp); +} + +static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, + u64 ncleanadd, u64 ndirtyadd) +{ + struct nilfs_sufile_header *header; + void *kaddr; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, ncleanadd); + le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); +} + +int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + int ret; + + if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING "%s: invalid segment number: %llu\n", + __func__, (unsigned long long)segnum); + return -EINVAL; + } + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); + if (!ret) { + dofunc(sufile, segnum, header_bh, bh); + brelse(bh); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_alloc - allocate a segment + * @sufile: inode of segment usage file + * @segnump: pointer to segment number + * + * Description: nilfs_sufile_alloc() allocates a clean segment. + * + * Return Value: On success, 0 is returned and the segment number of the + * allocated segment is stored in the place pointed by @segnump. 
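 *
 * (Editor's aside with illustrative figures, not taken from the patch:
 * with 4096-byte blocks and the 16-byte struct nilfs_segment_usage,
 * mi_entries_per_block is 4096 / 16 = 256; assuming
 * mi_first_entry_offset == 1, the usage entry of segment 1000 lives in
 * block (1000 + 1) / 256 = 3 at slot (1000 + 1) % 256 = 233, which is
 * exactly what nilfs_sufile_get_blkoff() and nilfs_sufile_get_offset()
 * above compute via do_div().)
 *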
On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No clean segment left. + */ +int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) +{ + struct buffer_head *header_bh, *su_bh; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + __u64 segnum, maxsegnum, last_alloc; + void *kaddr; + unsigned long nsegments, ncleansegs, nsus; + int ret, i, j; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + ncleansegs = le64_to_cpu(header->sh_ncleansegs); + last_alloc = le64_to_cpu(header->sh_last_alloc); + kunmap_atomic(kaddr, KM_USER0); + + nsegments = nilfs_sufile_get_nsegments(sufile); + segnum = last_alloc + 1; + maxsegnum = nsegments - 1; + for (i = 0; i < nsegments; i += nsus) { + if (segnum >= nsegments) { + /* wrap around */ + segnum = 0; + maxsegnum = last_alloc; + } + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, + &su_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + + nsus = nilfs_sufile_segment_usages_in_block( + sufile, segnum, maxsegnum); + for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { + if (!nilfs_segment_usage_clean(su)) + continue; + /* found a clean segment */ + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header( + sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + header->sh_last_alloc = cpu_to_le64(segnum); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + *segnump = segnum; + goto out_header; + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + + /* no segments left */ + ret = -ENOSPC; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (unlikely(!nilfs_segment_usage_clean(su))) { + printk(KERN_WARNING "%s: segment %llu must be clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + return; + } + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, -1, 1); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int clean, dirty; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) && + su->su_nblocks == cpu_to_le32(0)) { + kunmap_atomic(kaddr, 
KM_USER0); + return; + } + clean = nilfs_segment_usage_clean(su); + dirty = nilfs_segment_usage_dirty(su); + + /* make the segment garbage */ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int sudirty; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_clean(su)) { + printk(KERN_WARNING "%s: segment %llu is already clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + return; + } + WARN_ON(nilfs_segment_usage_error(su)); + WARN_ON(!nilfs_segment_usage_dirty(su)); + + sudirty = nilfs_segment_usage_dirty(su); + nilfs_segment_usage_set_clean(su); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(su_bh); + + nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_get_segment_usage - get a segment usage + * @sufile: inode of segment usage file + * @segnum: segment number + * @sup: pointer to segment usage + * @bhp: pointer to buffer head + * + * Description: nilfs_sufile_get_segment_usage() acquires the segment usage + * specified by @segnum. + * + * Return Value: On success, 0 is returned, and the segment usage and the + * buffer head of the buffer on which the segment usage is located are stored + * in the place pointed by @sup and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum, + struct nilfs_segment_usage **sup, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + /* segnum is 0 origin */ + if (segnum >= nilfs_sufile_get_nsegments(sufile)) + return -EINVAL; + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap(bh->b_page); + brelse(bh); + ret = -EINVAL; + goto out_sem; + } + + if (sup != NULL) + *sup = su; + *bhp = bh; + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_put_segment_usage - put a segment usage + * @sufile: inode of segment usage file + * @segnum: segment number + * @bh: buffer head + * + * Description: nilfs_sufile_put_segment_usage() releases the segment usage + * specified by @segnum. @bh must be the buffer head which has been returned + * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
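 *
 * Editor's sketch (not part of the patch) of the intended pairing:
 *
 *	struct nilfs_segment_usage *su;
 *	struct buffer_head *bh;
 *	int err;
 *
 *	err = nilfs_sufile_get_segment_usage(sufile, segnum, &su, &bh);
 *	if (!err) {
 *		... read or update the kmap()ed usage entry at *su ...
 *		nilfs_sufile_put_segment_usage(sufile, segnum, bh);
 *	}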
+ */ +void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_sufile_get_stat - get segment usage statistics + * @sufile: inode of segment usage file + * @sustat: pointer to a structure of segment usage statistics + * + * Description: nilfs_sufile_get_stat() returns information about segment + * usage. + * + * Return Value: On success, 0 is returned, and segment usage information is + * stored in the place pointed by @sustat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) +{ + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); + sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); + sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); + sustat->ss_ctime = nilfs->ns_ctime; + sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; + spin_lock(&nilfs->ns_last_segment_lock); + sustat->ss_prot_seq = nilfs->ns_prot_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + + out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_ncleansegs - get the number of clean segments + * @sufile: inode of segment usage file + * @nsegsp: pointer to the number of clean segments + * + * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean + * segments. + * + * Return Value: On success, 0 is returned and the number of clean segments is + * stored in the place pointed by @nsegsp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp) +{ + struct nilfs_sustat sustat; + int ret; + + ret = nilfs_sufile_get_stat(sufile, &sustat); + if (ret == 0) + *nsegsp = sustat.ss_ncleansegs; + return ret; +} + +void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int suclean; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap_atomic(kaddr, KM_USER0); + return; + } + suclean = nilfs_segment_usage_clean(su); + nilfs_segment_usage_set_error(su); + kunmap_atomic(kaddr, KM_USER0); + + if (suclean) + nilfs_sufile_mod_counter(header_bh, -1, 0); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_get_suinfo - get segment usage information + * @sufile: inode of segment usage file + * @segnum: segment number to start looking + * @si: array of suinfo + * @nsi: size of suinfo array + * + * Description: nilfs_sufile_get_suinfo() retrieves usage information of up + * to @nsi segments, starting from the segment specified by @segnum; holes + * are reported as zeroed entries. + * + * Return Value: On success, the number of segment usage entries stored in + * @si is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error.
+ * + * %-ENOMEM - Insufficient amount of memory available. + */ +ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, + struct nilfs_suinfo *si, size_t nsi) +{ + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; + void *kaddr; + unsigned long nsegs, segusages_per_block; + ssize_t n; + int ret, i, j; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + nsegs = min_t(unsigned long, + nilfs_sufile_get_nsegments(sufile) - segnum, + nsi); + for (i = 0; i < nsegs; i += n, segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + nsegs - i); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + /* hole */ + memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n); + continue; + } + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + for (j = 0; j < n; j++, su = (void *)su + susz) { + si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod); + si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); + si[i + j].sui_flags = le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + if (nilfs_segment_is_active(nilfs, segnum + j)) + si[i + j].sui_flags |= + (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + ret = nsegs; + + out: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h new file mode 100644 index 000000000000..a2e2efd4ade1 --- /dev/null +++ b/fs/nilfs2/sufile.h @@ -0,0 +1,125 @@ +/* + * sufile.h - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. 
+ */ + +#ifndef _NILFS_SUFILE_H +#define _NILFS_SUFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" + +#define NILFS_SUFILE_GFP NILFS_MDT_GFP + +static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; +} + +int nilfs_sufile_alloc(struct inode *, __u64 *); +int nilfs_sufile_get_segment_usage(struct inode *, __u64, + struct nilfs_segment_usage **, + struct buffer_head **); +void nilfs_sufile_put_segment_usage(struct inode *, __u64, + struct buffer_head *); +int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); +int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, + size_t); + +int nilfs_sufile_update(struct inode *, __u64, int, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); + +/** + * nilfs_sufile_cancel_free - cancel freeing of a segment + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_cancel_free() turns the segment specified by + * @segnum from the clean state back into the dirty state, undoing a + * preceding free. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_cancel_free); +} + +/** + * nilfs_sufile_scrap - make a segment garbage + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap); +} + +/** + * nilfs_sufile_free - free a segment + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_set_error - mark a segment as erroneous + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_set_error() marks the segment specified by + * @segnum as erroneous. The error segment will never be used again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_set_error); +} + +#endif /* _NILFS_SUFILE_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c new file mode 100644 index 000000000000..6989b03e97ab --- /dev/null +++ b/fs/nilfs2/super.c @@ -0,0 +1,1326 @@ +/* + * super.c - NILFS module and super block management. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/parser.h> +#include <linux/random.h> +#include <linux/crc32.h> +#include <linux/smp_lock.h> +#include <linux/vfs.h> +#include <linux/writeback.h> +#include <linux/kobject.h> +#include <linux/exportfs.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "page.h" +#include "cpfile.h" +#include "ifile.h" +#include "dat.h" +#include "segment.h" +#include "segbuf.h" + +MODULE_AUTHOR("NTT Corp."); +MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " + "(NILFS)"); +MODULE_LICENSE("GPL"); + +static int nilfs_remount(struct super_block *sb, int *flags, char *data); +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags); + +/** + * nilfs_error() - report failure condition on a filesystem + * + * nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. It should be called when NILFS detects + * inconsistencies or defects of metadata on disk. For survivable errors + * such as a single-shot I/O error, nilfs_warning() or the printk() + * function should be used instead. + * + * The segment constructor must not call this function because it could + * end up killing itself. + */ +void nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...)
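/*
 * Editor's sketch, not part of the patch: typical call sites of the two
 * printf-style reporting helpers (the message texts below are invented
 * for illustration):
 *
 *	nilfs_error(sb, __func__, "broken bmap (ino=%lu)", ino);
 *	nilfs_warning(sb, __func__, "I/O error on device %s", sb->s_id);
 */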
+{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (!(sb->s_flags & MS_RDONLY)) { + struct the_nilfs *nilfs = sbi->s_nilfs; + + if (!nilfs_test_opt(sbi, ERRORS_CONT)) + nilfs_detach_segment_constructor(sbi); + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + nilfs->ns_sbp[0]->s_state |= + cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sbi, 1); + } + up_write(&nilfs->ns_sem); + + if (nilfs_test_opt(sbi, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + } + + if (nilfs_test_opt(sbi, ERRORS_PANIC)) + panic("NILFS (device %s): panic forced after error\n", + sb->s_id); +} + +void nilfs_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_WARNING "NILFS warning (device %s): %s: ", + sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + +static struct kmem_cache *nilfs_inode_cachep; + +struct inode *nilfs_alloc_inode(struct super_block *sb) +{ + struct nilfs_inode_info *ii; + + ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + if (!ii) + return NULL; + ii->i_bh = NULL; + ii->i_state = 0; + ii->vfs_inode.i_version = 1; + nilfs_btnode_cache_init(&ii->i_btnode_cache); + return &ii->vfs_inode; +} + +void nilfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); +} + +static void init_once(void *obj) +{ + struct nilfs_inode_info *ii = obj; + + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; + inode_init_once(&ii->vfs_inode); +} + +static int nilfs_init_inode_cache(void) +{ + nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once); + + return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0; +} + +static inline void nilfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(nilfs_inode_cachep); +} + +static void nilfs_clear_inode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + +#ifdef CONFIG_NILFS_POSIX_ACL + if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) { + posix_acl_release(ii->i_acl); + ii->i_acl = NILFS_ACL_NOT_CACHED; + } + if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) { + posix_acl_release(ii->i_default_acl); + ii->i_default_acl = NILFS_ACL_NOT_CACHED; + } +#endif + /* + * Free resources allocated in nilfs_read_inode() here. + */ + BUG_ON(!list_empty(&ii->i_dirty)); + brelse(ii->i_bh); + ii->i_bh = NULL; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); + + nilfs_btnode_cache_clear(&ii->i_btnode_cache); +} + +static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + int barrier_done = 0; + + if (nilfs_test_opt(sbi, BARRIER)) { + set_buffer_ordered(nilfs->ns_sbh[0]); + barrier_done = 1; + } + retry: + set_buffer_dirty(nilfs->ns_sbh[0]); + err = sync_dirty_buffer(nilfs->ns_sbh[0]); + if (err == -EOPNOTSUPP && barrier_done) { + nilfs_warning(sbi->s_super, __func__, + "barrier-based sync failed. " + "disabling barriers\n"); + nilfs_clear_opt(sbi, BARRIER); + barrier_done = 0; + clear_buffer_ordered(nilfs->ns_sbh[0]); + goto retry; + } + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: unable to write superblock (err=%d)\n", err); + if (err == -EIO && nilfs->ns_sbh[1]) { + nilfs_fall_back_super_block(nilfs); + goto retry; + } + } else { + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + + /* + * The latest segment becomes trailable from the position + * written in the superblock. + */ + clear_nilfs_discontinued(nilfs); + + /* update GC protection for recent segments */ + if (nilfs->ns_sbh[1]) { + sbp = NULL; + if (dupsb) { + set_buffer_dirty(nilfs->ns_sbh[1]); + if (!sync_dirty_buffer(nilfs->ns_sbh[1])) + sbp = nilfs->ns_sbp[1]; + } + } + if (sbp) { + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + } + + return err; +} + +int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + sector_t nfreeblocks; + time_t t; + int err; + + /* nilfs->ns_sem must be locked by the caller. */ + if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { + if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) + nilfs_swap_super_block(nilfs); + else { + printk(KERN_CRIT "NILFS: superblock broken on dev %s\n", + sbi->s_super->s_id); + return -EIO; + } + } + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: failed to count free blocks\n"); + return err; + } + spin_lock(&nilfs->ns_last_segment_lock); + sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); + + t = get_seconds(); + nilfs->ns_sbwtime[0] = t; + sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks); + sbp[0]->s_wtime = cpu_to_le64(t); + sbp[0]->s_sum = 0; + sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[0], + nilfs->ns_sbsize)); + if (dupsb && sbp[1]) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + nilfs->ns_sbwtime[1] = t; + } + sbi->s_super->s_dirt = 0; + return nilfs_sync_super(sbi, dupsb); +} + +static void nilfs_put_super(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_detach_segment_constructor(sbi); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + } + + nilfs_detach_checkpoint(sbi); + put_nilfs(sbi->s_nilfs); + sbi->s_super = NULL; + sb->s_fs_info = NULL; + kfree(sbi); +} + +/** + * nilfs_write_super - write super block(s) of NILFS + * @sb: super_block + * + * nilfs_write_super() gets an fs-dependent lock, writes super block(s), and + * clears s_dirt. This function is called in the section protected by + * lock_super(). + * + * The s_dirt flag is managed by each filesystem and we protect it with the + * ns_sem of struct the_nilfs. Lock order must be as follows: + * + * 1. lock_super() + * 2. down_write(&nilfs->ns_sem) + * + * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer + * of the super block (nilfs->ns_sbp[]). + * + * In most cases, VFS functions call lock_super() before calling these + * methods.
So we must be careful not to introduce deadlocks when + * using lock_super(); see generic_shutdown_super(), write_super(), and so on. + * + * Note that the order of lock_kernel() and lock_super() depends on the VFS + * calling context. Also note that lock_kernel() can be taken again inside + * the protected section; it nests, and only the outermost acquisition has + * an effect. + */ +static void nilfs_write_super(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + + down_write(&nilfs->ns_sem); + if (!(sb->s_flags & MS_RDONLY)) { + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 t = get_seconds(); + int dupsb; + + if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && + t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { + up_write(&nilfs->ns_sem); + return; + } + dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; + nilfs_commit_super(sbi, dupsb); + } + sb->s_dirt = 0; + up_write(&nilfs->ns_sem); +} + +static int nilfs_sync_fs(struct super_block *sb, int wait) +{ + int err = 0; + + /* This function is called when the super block should be written back */ + if (wait) + err = nilfs_construct_segment(sb); + return err; +} + +int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_checkpoint *raw_cp; + struct buffer_head *bh_cp; + int err; + + down_write(&nilfs->ns_sem); + list_add(&sbi->s_list, &nilfs->ns_supers); + up_write(&nilfs->ns_sem); + + sbi->s_ifile = nilfs_mdt_new( + nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); + if (!sbi->s_ifile) + return -ENOMEM; + + err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size); + if (unlikely(err)) + goto failed; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, + &bh_cp); + if (unlikely(err)) { + if (err == -ENOENT || err == -EINVAL) { + printk(KERN_ERR + "NILFS: Invalid checkpoint " + "(checkpoint number=%llu)\n", + (unsigned long long)cno); + err = -EINVAL; + } + goto failed; + } + err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); + if (unlikely(err)) + goto failed_bh; + atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); + atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + return 0; + + failed_bh: + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + failed: + nilfs_mdt_destroy(sbi->s_ifile); + sbi->s_ifile = NULL; + + down_write(&nilfs->ns_sem); + list_del_init(&sbi->s_list); + up_write(&nilfs->ns_sem); + + return err; +} + +void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_mdt_clear(sbi->s_ifile); + nilfs_mdt_destroy(sbi->s_ifile); + sbi->s_ifile = NULL; + down_write(&nilfs->ns_sem); + list_del_init(&sbi->s_list); + up_write(&nilfs->ns_sem); +} + +static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err = 0; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { + nilfs->ns_mount_state |= NILFS_VALID_FS; + err = nilfs_commit_super(sbi, 1); + if (likely(!err)) + printk(KERN_INFO "NILFS: recovery complete.\n"); + } + up_write(&nilfs->ns_sem); + return err; +} + +static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + 
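/*
 * Editor's note with illustrative figures (not taken from the patch):
 * the reserve accounting below withholds the reserved segments from
 * f_bavail. For example, with ns_blocks_per_segment = 2048 and
 * ns_nrsvsegs = 8, nrsvblocks = 8 * 2048 = 16384 blocks stay counted
 * in f_bfree but are reported as unavailable in f_bavail.
 */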
unsigned long long blocks; + unsigned long overhead; + unsigned long nrsvblocks; + sector_t nfreeblocks; + int err; + + /* + * Compute all of the segment blocks + * + * The blocks before the first segment and after the last segment + * are excluded. + */ + blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments + - nilfs->ns_first_data_block; + nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; + + /* + * Compute the overhead + * + * When distributing metadata blocks outside the segment structure, + * we must count them as overhead. + */ + overhead = 0; + + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) + return err; + + buf->f_type = NILFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = blocks - overhead; + buf->f_bfree = nfreeblocks; + buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? + (buf->f_bfree - nrsvblocks) : 0; + buf->f_files = atomic_read(&sbi->s_inodes_count); + buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_namelen = NILFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static struct super_operations nilfs_sops = { + .alloc_inode = nilfs_alloc_inode, + .destroy_inode = nilfs_destroy_inode, + .dirty_inode = nilfs_dirty_inode, + /* .write_inode = nilfs_write_inode, */ + /* .put_inode = nilfs_put_inode, */ + /* .drop_inode = nilfs_drop_inode, */ + .delete_inode = nilfs_delete_inode, + .put_super = nilfs_put_super, + .write_super = nilfs_write_super, + .sync_fs = nilfs_sync_fs, + /* .write_super_lockfs */ + /* .unlockfs */ + .statfs = nilfs_statfs, + .remount_fs = nilfs_remount, + .clear_inode = nilfs_clear_inode, + /* .umount_begin */ + /* .show_options */ +}; + +static struct inode * +nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO && + ino != NILFS_SKETCH_INO) + return ERR_PTR(-ESTALE); + + inode = nilfs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry * +nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + nilfs_nfs_get_inode); +} + +static struct dentry * +nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + nilfs_nfs_get_inode); +} + +static struct export_operations nilfs_export_ops = { + .fh_to_dentry = nilfs_fh_to_dentry, + .fh_to_parent = nilfs_fh_to_parent, + .get_parent = nilfs_get_parent, +}; + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_barrier, Opt_snapshot, Opt_order, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier=%s"}, + {Opt_snapshot, "cp=%u"}, + {Opt_order, "order=%s"}, + {Opt_err, NULL} +}; + +static int match_bool(substring_t *s, int *result) +{ + int len = s->to - s->from; + + if (strncmp(s->from, "on", len) == 0) + *result = 1; + else if (strncmp(s->from, "off", len) == 0) + *result = 0; + else + return 1; + return 0; +} + +static int parse_options(char *options, struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p =
strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + if (match_bool(&args[0], &option)) + return 0; + if (option) + nilfs_set_opt(sbi, BARRIER); + else + nilfs_clear_opt(sbi, BARRIER); + break; + case Opt_order: + if (strcmp(args[0].from, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(sbi, STRICT_ORDER); + else if (strcmp(args[0].from, "strict") == 0) + /* Strict in-order semantics */ + nilfs_set_opt(sbi, STRICT_ORDER); + else + return 0; + break; + case Opt_err_panic: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); + break; + case Opt_err_ro: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); + break; + case Opt_err_cont: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); + break; + case Opt_snapshot: + if (match_int(&args[0], &option) || option <= 0) + return 0; + if (!(sb->s_flags & MS_RDONLY)) + return 0; + sbi->s_snapshot_cno = option; + nilfs_set_opt(sbi, SNAPSHOT); + break; + default: + printk(KERN_ERR + "NILFS: Unrecognized mount option \"%s\"\n", p); + return 0; + } + } + return 1; +} + +static inline void +nilfs_set_default_options(struct nilfs_sb_info *sbi, + struct nilfs_super_block *sbp) +{ + sbi->s_mount_opt = + NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; +} + +static int nilfs_setup_super(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); + int mnt_count = le16_to_cpu(sbp->s_mnt_count); + + /* nilfs->ns_sem must be locked by the caller. */ + if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + printk(KERN_WARNING + "NILFS warning: mounting fs with errors\n"); +#if 0 + } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { + printk(KERN_WARNING + "NILFS warning: maximal mount count reached\n"); +#endif + } + if (!max_mnt_count) + sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); + sbp->s_mtime = cpu_to_le64(get_seconds()); + return nilfs_commit_super(sbi, 1); +} + +struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, + u64 pos, int blocksize, + struct buffer_head **pbh) +{ + unsigned long long sb_index = pos; + unsigned long offset; + + offset = do_div(sb_index, blocksize); + *pbh = sb_bread(sb, sb_index); + if (!*pbh) + return NULL; + return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); +} + +int nilfs_store_magic_and_option(struct super_block *sb, + struct nilfs_super_block *sbp, + char *data) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + sb->s_magic = le16_to_cpu(sbp->s_magic); + + /* FS independent flags */ +#ifdef NILFS_ATIME_DISABLE + sb->s_flags |= MS_NOATIME; +#endif + + nilfs_set_default_options(sbi, sbp); + + sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); + sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); + sbi->s_interval = le32_to_cpu(sbp->s_c_interval); + sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); + + return !parse_options(data, sb) ? -EINVAL : 0; +} + +/** + * nilfs_fill_super() - initialize a super block instance + * @sb: super_block + * @data: mount options + * @silent: silent mode flag + * @nilfs: the_nilfs struct + * + * This function is called exclusively under the block device's bd_mount_sem.
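 *
 * Editor's sketch -- an assumption about the enclosing get_sb path,
 * which is not part of this hunk -- of how that exclusion is arranged:
 *
 *	down(&sd.bdev->bd_mount_sem);
 *	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
 *	if (!IS_ERR(s) && !s->s_root)
 *		err = nilfs_fill_super(s, data, flags & MS_SILENT, nilfs);
 *	up(&sd.bdev->bd_mount_sem);
 *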
+ * So, the recovery process is protected from other simultaneous mounts. + */ +static int +nilfs_fill_super(struct super_block *sb, void *data, int silent, + struct the_nilfs *nilfs) +{ + struct nilfs_sb_info *sbi; + struct inode *root; + __u64 cno; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + get_nilfs(nilfs); + sbi->s_nilfs = nilfs; + sbi->s_super = sb; + + err = init_nilfs(nilfs, sbi, (char *)data); + if (err) + goto failed_sbi; + + spin_lock_init(&sbi->s_inode_lock); + INIT_LIST_HEAD(&sbi->s_dirty_files); + INIT_LIST_HEAD(&sbi->s_list); + + /* + * The following initializations are redundant because the + * nilfs_sb_info structure was already cleared by kzalloc() above; + * we keep them explicit for clarity and for future changes. + */ + get_random_bytes(&sbi->s_next_generation, + sizeof(sbi->s_next_generation)); + spin_lock_init(&sbi->s_next_gen_lock); + + sb->s_op = &nilfs_sops; + sb->s_export_op = &nilfs_export_ops; + sb->s_root = NULL; + sb->s_time_gran = 1; + + if (!nilfs_loaded(nilfs)) { + err = load_nilfs(nilfs, sbi); + if (err) + goto failed_sbi; + } + cno = nilfs_last_cno(nilfs); + + if (sb->s_flags & MS_RDONLY) { + if (nilfs_test_opt(sbi, SNAPSHOT)) { + err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, + sbi->s_snapshot_cno); + if (err < 0) + goto failed_sbi; + if (!err) { + printk(KERN_ERR + "NILFS: The specified checkpoint is " + "not a snapshot " + "(checkpoint number=%llu).\n", + (unsigned long long)sbi->s_snapshot_cno); + err = -EINVAL; + goto failed_sbi; + } + cno = sbi->s_snapshot_cno; + } else + /* Read-only mount */ + sbi->s_snapshot_cno = cno; + } + + err = nilfs_attach_checkpoint(sbi, cno); + if (err) { + printk(KERN_ERR "NILFS: error loading a checkpoint" + " (checkpoint number=%llu).\n", (unsigned long long)cno); + goto failed_sbi; + } + + if (!(sb->s_flags & MS_RDONLY)) { + err = nilfs_attach_segment_constructor(sbi); + if (err) + goto failed_checkpoint; + } + + root = nilfs_iget(sb, NILFS_ROOT_INO); + if (IS_ERR(root)) { + printk(KERN_ERR "NILFS: get root inode failed\n"); + err = PTR_ERR(root); + goto failed_segctor; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + printk(KERN_ERR "NILFS: corrupt root inode.\n"); + err = -EINVAL; + goto failed_segctor; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + printk(KERN_ERR "NILFS: get root dentry failed\n"); + err = -ENOMEM; + goto failed_segctor; + } + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_setup_super(sbi); + up_write(&nilfs->ns_sem); + } + + err = nilfs_mark_recovery_complete(sbi); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: recovery failed.\n"); + goto failed_root; + } + + return 0; + + failed_root: + dput(sb->s_root); + sb->s_root = NULL; + + failed_segctor: + nilfs_detach_segment_constructor(sbi); + + failed_checkpoint: + nilfs_detach_checkpoint(sbi); + + failed_sbi: + put_nilfs(nilfs); + sb->s_fs_info = NULL; + kfree(sbi); + return err; +} + +static int nilfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_super_block *sbp; + struct the_nilfs *nilfs = sbi->s_nilfs; + unsigned long old_sb_flags; + struct nilfs_mount_options old_opts; + int err; + + old_sb_flags = sb->s_flags; + old_opts.mount_opt = sbi->s_mount_opt; + old_opts.snapshot_cno = sbi->s_snapshot_cno; + + if (!parse_options(data, sb)) { + err = -EINVAL; + goto restore_opts; + } + sb->s_flags =
(sb->s_flags & ~MS_POSIXACL); + + if ((*flags & MS_RDONLY) && + sbi->s_snapshot_cno != old_opts.snapshot_cno) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount to a different snapshot.\n", + sb->s_id); + err = -EINVAL; + goto restore_opts; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out; + if (*flags & MS_RDONLY) { + /* Shutting down the segment constructor */ + nilfs_detach_segment_constructor(sbi); + sb->s_flags |= MS_RDONLY; + + sbi->s_snapshot_cno = nilfs_last_cno(nilfs); + /* nilfs_set_opt(sbi, SNAPSHOT); */ + + /* + * Remounting a valid RW partition RDONLY, so set + * the RDONLY flag and then mark the partition as valid again. + */ + down_write(&nilfs->ns_sem); + sbp = nilfs->ns_sbp[0]; + if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) && + (nilfs->ns_mount_state & NILFS_VALID_FS)) + sbp->s_state = cpu_to_le16(nilfs->ns_mount_state); + sbp->s_mtime = cpu_to_le64(get_seconds()); + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + } else { + /* + * Mounting an RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by fsck since we originally mounted the partition.) + */ + down(&sb->s_bdev->bd_mount_sem); + /* Check existing RW-mount */ + if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because a RW-mount exists.\n", + sb->s_id); + err = -EBUSY; + goto rw_remount_failed; + } + if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the current RO-mount is not " + "the latest one.\n", + sb->s_id); + err = -EINVAL; + goto rw_remount_failed; + } + sb->s_flags &= ~MS_RDONLY; + nilfs_clear_opt(sbi, SNAPSHOT); + sbi->s_snapshot_cno = 0; + + err = nilfs_attach_segment_constructor(sbi); + if (err) + goto rw_remount_failed; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sbi); + up_write(&nilfs->ns_sem); + + up(&sb->s_bdev->bd_mount_sem); + } + out: + return 0; + + rw_remount_failed: + up(&sb->s_bdev->bd_mount_sem); + restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.mount_opt; + sbi->s_snapshot_cno = old_opts.snapshot_cno; + return err; +} + +struct nilfs_super_data { + struct block_device *bdev; + __u64 cno; + int flags; +}; + +/** + * nilfs_identify - pre-read mount options needed to identify mount instance + * @data: mount options + * @sd: nilfs_super_data + */ +static int nilfs_identify(char *data, struct nilfs_super_data *sd) +{ + char *p, *options = data; + substring_t args[MAX_OPT_ARGS]; + int option, token; + int ret = 0; + + do { + p = strsep(&options, ","); + if (p != NULL && *p) { + token = match_token(p, tokens, args); + if (token == Opt_snapshot) { + if (!(sd->flags & MS_RDONLY)) + ret++; + else { + ret = match_int(&args[0], &option); + if (!ret) { + if (option > 0) + sd->cno = option; + else + ret++; + } + } + } + if (ret) + printk(KERN_ERR + "NILFS: invalid mount option: %s\n", p); + } + if (!options) + break; + BUG_ON(options == data); + *(options - 1) = ','; + } while (!ret); + return ret; +} + +static int nilfs_set_bdev_super(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + + s->s_bdev = sd->bdev; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int nilfs_test_bdev_super(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + + return s->s_bdev == sd->bdev; +} + +static int nilfs_test_bdev_super2(struct super_block *s, void *data) +{ + struct
nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + + if (!((s->s_flags | sd->flags) & MS_RDONLY)) + return 1; /* Reuse an old R/W-mode super_block */ + + if (s->s_flags & sd->flags & MS_RDONLY) { + if (down_read_trylock(&s->s_umount)) { + ret = s->s_root && + (sd->cno == NILFS_SB(s)->s_snapshot_cno); + up_read(&s->s_umount); + /* + * This path is locked with sb_lock by sget(). + * So, drop_super() would cause a deadlock here. + */ + return ret; + } + } + return 0; +} + +static int +nilfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + struct nilfs_super_data sd; + struct super_block *s, *s2; + struct the_nilfs *nilfs = NULL; + int err, need_to_close = 1; + + sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); + if (IS_ERR(sd.bdev)) + return PTR_ERR(sd.bdev); + + /* + * To get a mount instance using the sget() VFS routine, NILFS needs + * much more information than normal filesystems to identify a mount + * instance. For snapshot mounts, not only a mount type (ro-mount + * or rw-mount) but also a checkpoint number is required. + * The results are passed to sget() using nilfs_super_data. + */ + sd.cno = 0; + sd.flags = flags; + if (nilfs_identify((char *)data, &sd)) { + err = -EINVAL; + goto failed; + } + + /* + * Once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting. + */ + down(&sd.bdev->bd_mount_sem); + if (!sd.cno && + (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { + err = (err < 0) ? : -EBUSY; + goto failed_unlock; + } + + /* + * Phase-1: search for an existing instance and get the_nilfs + */ + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); + if (IS_ERR(s)) + goto error_s; + + if (!s->s_root) { + err = -ENOMEM; + nilfs = alloc_nilfs(sd.bdev); + if (!nilfs) + goto cancel_new; + } else { + struct nilfs_sb_info *sbi = NILFS_SB(s); + + /* + * s_umount protects the super_block from the unmount process; + * it covers the nilfs_sb_info and the_nilfs pointers. + */ + nilfs = sbi->s_nilfs; + get_nilfs(nilfs); + up_write(&s->s_umount); + + /* + * Phase-2: search for the specified snapshot or R/W-mode super_block + */ + if (!sd.cno) + /* trying to get the latest checkpoint. */ + sd.cno = nilfs_last_cno(nilfs); + + s2 = sget(fs_type, nilfs_test_bdev_super2, + nilfs_set_bdev_super, &sd); + deactivate_super(s); + /* + * deactivate_super() invokes close_bdev_exclusive() via + * kill_block_super(), but here s is an existing mount, so we + * need one more close_bdev_exclusive() call.
+ */ + s = s2; + if (IS_ERR(s)) + goto error_s; + } + + if (!s->s_root) { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(sd.bdev)); + + err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); + if (err) + goto cancel_new; + + s->s_flags |= MS_ACTIVE; + need_to_close = 0; + } else if (!(s->s_flags & MS_RDONLY)) { + err = -EBUSY; + } + + up(&sd.bdev->bd_mount_sem); + put_nilfs(nilfs); + if (need_to_close) + close_bdev_exclusive(sd.bdev, flags); + simple_set_mnt(mnt, s); + return 0; + + error_s: + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + close_bdev_exclusive(sd.bdev, flags); + return PTR_ERR(s); + + failed_unlock: + up(&sd.bdev->bd_mount_sem); + failed: + close_bdev_exclusive(sd.bdev, flags); + + return err; + + cancel_new: + /* Abandoning the newly allocated superblock */ + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + up_write(&s->s_umount); + deactivate_super(s); + /* + * deactivate_super() invokes close_bdev_exclusive(). + * We must finish all cleanup before this call; + * put_nilfs() and unlocking bd_mount_sem need the block device. + */ + return err; +} + +static int nilfs_test_bdev_super3(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + if (down_read_trylock(&s->s_umount)) { + ret = (s->s_flags & MS_RDONLY) && s->s_root && + nilfs_test_opt(NILFS_SB(s), SNAPSHOT); + up_read(&s->s_umount); + if (ret) + return 0; /* ignore snapshot mounts */ + } + return !((sd->flags ^ s->s_flags) & MS_RDONLY); +} + +static int __false_bdev_super(struct super_block *s, void *data) +{ +#if 0 /* XXX: workaround for lock debug. This is not a good idea. */ + up_write(&s->s_umount); +#endif + return -EFAULT; +} + +/** + * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not. + * @fs_type: filesystem type + * @bdev: block device + * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount) + * + * Returns 1 if such a mount exists, 0 if not, or a negative error code. + * This function must be called within a section protected by bd_mount_sem.
+ */ +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags) +{ + struct super_block *s; + struct nilfs_super_data sd = { .flags = flags, .bdev = bdev }; + + s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd); + if (IS_ERR(s)) { + if (PTR_ERR(s) != -EFAULT) + return PTR_ERR(s); + return 0; /* Not found */ + } + up_write(&s->s_umount); + deactivate_super(s); + return 1; /* Found */ +} + +struct file_system_type nilfs_fs_type = { + .owner = THIS_MODULE, + .name = "nilfs2", + .get_sb = nilfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_nilfs_fs(void) +{ + int err; + + err = nilfs_init_inode_cache(); + if (err) + goto failed; + + err = nilfs_init_transaction_cache(); + if (err) + goto failed_inode_cache; + + err = nilfs_init_segbuf_cache(); + if (err) + goto failed_transaction_cache; + + err = nilfs_btree_path_cache_init(); + if (err) + goto failed_segbuf_cache; + + err = register_filesystem(&nilfs_fs_type); + if (err) + goto failed_btree_path_cache; + + return 0; + + failed_btree_path_cache: + nilfs_btree_path_cache_destroy(); + + failed_segbuf_cache: + nilfs_destroy_segbuf_cache(); + + failed_transaction_cache: + nilfs_destroy_transaction_cache(); + + failed_inode_cache: + nilfs_destroy_inode_cache(); + + failed: + return err; +} + +static void __exit exit_nilfs_fs(void) +{ + nilfs_destroy_segbuf_cache(); + nilfs_destroy_transaction_cache(); + nilfs_destroy_inode_cache(); + nilfs_btree_path_cache_destroy(); + unregister_filesystem(&nilfs_fs_type); +} + +module_init(init_nilfs_fs) +module_exit(exit_nilfs_fs) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c new file mode 100644 index 000000000000..7f65b3be4aa9 --- /dev/null +++ b/fs/nilfs2/the_nilfs.c @@ -0,0 +1,641 @@ +/* + * the_nilfs.c - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/crc32.h> +#include "nilfs.h" +#include "segment.h" +#include "alloc.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" +#include "seglist.h" +#include "segbuf.h" + +void nilfs_set_last_segment(struct the_nilfs *nilfs, + sector_t start_blocknr, u64 seq, __u64 cno) +{ + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_last_pseg = start_blocknr; + nilfs->ns_last_seq = seq; + nilfs->ns_last_cno = cno; + spin_unlock(&nilfs->ns_last_segment_lock); +} + +/** + * alloc_nilfs - allocate the_nilfs structure + * @bdev: block device to which the_nilfs is related + * + * alloc_nilfs() allocates memory for the_nilfs and + * initializes its reference count and locks. 
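The lifetime rules described here (together with put_nilfs() and get_nilfs() later in the file) follow the classic create-with-one-reference pattern. A hedged user-space analogue using C11 atomics, with a hypothetical obj type standing in for the_nilfs (sketch only, not kernel code):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct obj {
            atomic_int count;           /* analogous to ns_count */
    };

    static struct obj *obj_alloc(void)
    {
            struct obj *o = calloc(1, sizeof(*o));
            if (o)
                    atomic_init(&o->count, 1);  /* creator holds the first reference */
            return o;
    }

    static void obj_get(struct obj *o)
    {
            atomic_fetch_add(&o->count, 1);     /* caller must already hold a reference */
    }

    static void obj_put(struct obj *o)
    {
            if (atomic_fetch_sub(&o->count, 1) == 1)
                    free(o);                    /* last reference dropped */
    }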
+ * + * Return Value: On success, a pointer to the_nilfs is returned. + * On error, NULL is returned. + */ +struct the_nilfs *alloc_nilfs(struct block_device *bdev) +{ + struct the_nilfs *nilfs; + + nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL); + if (!nilfs) + return NULL; + + nilfs->ns_bdev = bdev; + atomic_set(&nilfs->ns_count, 1); + atomic_set(&nilfs->ns_writer_refcount, -1); + atomic_set(&nilfs->ns_ndirtyblks, 0); + init_rwsem(&nilfs->ns_sem); + mutex_init(&nilfs->ns_writer_mutex); + INIT_LIST_HEAD(&nilfs->ns_supers); + spin_lock_init(&nilfs->ns_last_segment_lock); + nilfs->ns_gc_inodes_h = NULL; + init_rwsem(&nilfs->ns_segctor_sem); + + return nilfs; +} + +/** + * put_nilfs - release a reference to the_nilfs + * @nilfs: the_nilfs structure to be released + * + * put_nilfs() decrements the reference counter of the_nilfs. + * If the reference count reaches zero, the_nilfs is freed. + */ +void put_nilfs(struct the_nilfs *nilfs) +{ + if (!atomic_dec_and_test(&nilfs->ns_count)) + return; + /* + * ns_count can never be incremented below this point because any + * caller of get_nilfs() must already hold a reference to the_nilfs, + * so no exclusion control is required here. + */ + might_sleep(); + if (nilfs_loaded(nilfs)) { + nilfs_mdt_clear(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_clear(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_clear(nilfs->ns_dat); + nilfs_mdt_destroy(nilfs->ns_dat); + /* XXX: how and when to clear nilfs->ns_gc_dat? */ + nilfs_mdt_destroy(nilfs->ns_gc_dat); + } + if (nilfs_init(nilfs)) { + nilfs_destroy_gccache(nilfs); + brelse(nilfs->ns_sbh[0]); + brelse(nilfs->ns_sbh[1]); + } + kfree(nilfs); +} + +static int nilfs_load_super_root(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, sector_t sr_block) +{ + static struct lock_class_key dat_lock_key; + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + unsigned dat_entry_size, segment_usage_size, checkpoint_size; + unsigned inode_size; + int err; + + err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1); + if (unlikely(err)) + return err; + + down_read(&nilfs->ns_sem); + dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); + checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); + segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); + up_read(&nilfs->ns_sem); + + inode_size = nilfs->ns_inode_size; + + err = -ENOMEM; + nilfs->ns_dat = nilfs_mdt_new( + nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + if (unlikely(!nilfs->ns_dat)) + goto failed; + + nilfs->ns_gc_dat = nilfs_mdt_new( + nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + if (unlikely(!nilfs->ns_gc_dat)) + goto failed_dat; + + nilfs->ns_cpfile = nilfs_mdt_new( + nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); + if (unlikely(!nilfs->ns_cpfile)) + goto failed_gc_dat; + + nilfs->ns_sufile = nilfs_mdt_new( + nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); + if (unlikely(!nilfs->ns_sufile)) + goto failed_cpfile; + + err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size); + if (unlikely(err)) + goto failed_sufile; + + lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key); + lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key); + + nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); + nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size, + sizeof(struct
nilfs_cpfile_header)); + nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size, + sizeof(struct nilfs_sufile_header)); + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); + + failed: + brelse(bh_sr); + return err; + + failed_sufile: + nilfs_mdt_destroy(nilfs->ns_sufile); + + failed_cpfile: + nilfs_mdt_destroy(nilfs->ns_cpfile); + + failed_gc_dat: + nilfs_mdt_destroy(nilfs->ns_gc_dat); + + failed_dat: + nilfs_mdt_destroy(nilfs->ns_dat); + goto failed; +} + +static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) +{ + memset(ri, 0, sizeof(*ri)); + INIT_LIST_HEAD(&ri->ri_used_segments); +} + +static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) +{ + nilfs_dispose_segment_list(&ri->ri_used_segments); +} + +/** + * load_nilfs - load and recover the nilfs + * @nilfs: the_nilfs structure to be loaded + * @sbi: nilfs_sb_info used to recover past segments + * + * load_nilfs() searches for and loads the latest super root, + * attaches the last segment, and does recovery if needed. + * The caller must serialize calls to this function against + * simultaneous mounts. + */ +int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + struct nilfs_recovery_info ri; + unsigned int s_flags = sbi->s_super->s_flags; + int really_read_only = bdev_read_only(nilfs->ns_bdev); + unsigned valid_fs; + int err = 0; + + nilfs_init_recovery_info(&ri); + + down_write(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_write(&nilfs->ns_sem); + + if (!valid_fs && (s_flags & MS_RDONLY)) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed; + } + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); + sbi->s_super->s_flags &= ~MS_RDONLY; + } + + err = nilfs_search_super_root(nilfs, sbi, &ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + } + + err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error loading super root.\n"); + goto failed; + } + + if (!valid_fs) { + err = nilfs_recover_logical_segments(nilfs, sbi, &ri); + if (unlikely(err)) { + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_dat); + goto failed; + } + if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) + sbi->s_super->s_dirt = 1; + } + + set_nilfs_loaded(nilfs); + + failed: + nilfs_clear_recovery_info(&ri); + sbi->s_super->s_flags = s_flags; + return err; +} + +static unsigned long long nilfs_max_size(unsigned int blkbits) +{ + unsigned int max_bits; + unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ + + max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ + if (max_bits < 64) + res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); + return res; +} + +static int nilfs_store_disk_layout(struct the_nilfs *nilfs, + struct
nilfs_super_block *sbp) +{ + if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { + printk(KERN_ERR "NILFS: revision mismatch " + "(superblock rev.=%d.%d, current rev.=%d.%d). " + "Please check the version of mkfs.nilfs.\n", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); + return -EINVAL; + } + nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); + if (nilfs->ns_sbsize > BLOCK_SIZE) + return -EINVAL; + + nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + + nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { + printk(KERN_ERR "NILFS: too short segment.\n"); + return -EINVAL; + } + + nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); + nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments); + nilfs->ns_r_segments_percentage = + le32_to_cpu(sbp->s_r_segments_percentage); + nilfs->ns_nrsvsegs = + max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nilfs->ns_nsegments * + nilfs->ns_r_segments_percentage, 100)); + nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); + return 0; +} + +static int nilfs_valid_sb(struct nilfs_super_block *sbp) +{ + static unsigned char sum[4]; + const int sumoff = offsetof(struct nilfs_super_block, s_sum); + size_t bytes; + u32 crc; + + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); + if (bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); + crc = crc32_le(crc, sum, 4); + crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, + bytes - sumoff - 4); + return crc == le32_to_cpu(sbp->s_sum); +} + +static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +{ + return offset < ((le64_to_cpu(sbp->s_nsegments) * + le32_to_cpu(sbp->s_blocks_per_segment)) << + (le32_to_cpu(sbp->s_log_block_size) + 10)); +} + +static void nilfs_release_super_block(struct the_nilfs *nilfs) +{ + int i; + + for (i = 0; i < 2; i++) { + if (nilfs->ns_sbp[i]) { + brelse(nilfs->ns_sbh[i]); + nilfs->ns_sbh[i] = NULL; + nilfs->ns_sbp[i] = NULL; + } + } +} + +void nilfs_fall_back_super_block(struct the_nilfs *nilfs) +{ + brelse(nilfs->ns_sbh[0]); + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = NULL; + nilfs->ns_sbp[1] = NULL; +} + +void nilfs_swap_super_block(struct the_nilfs *nilfs) +{ + struct buffer_head *tsbh = nilfs->ns_sbh[0]; + struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = tsbh; + nilfs->ns_sbp[1] = tsbp; +} + +static int nilfs_load_super_block(struct the_nilfs *nilfs, + struct super_block *sb, int blocksize, + struct nilfs_super_block **sbpp) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct buffer_head **sbh = nilfs->ns_sbh; + u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + int valid[2], swp = 0; + + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, + &sbh[0]); + sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); + + if (!sbp[0]) { + if (!sbp[1]) { + printk(KERN_ERR "NILFS: unable to read superblock\n"); + return -EIO; + } + printk(KERN_WARNING + "NILFS warning: unable to read primary superblock\n"); + } else if (!sbp[1]) + printk(KERN_WARNING + "NILFS warning: unable to read secondary superblock\n"); + + valid[0] =
nilfs_valid_sb(sbp[0]); + valid[1] = nilfs_valid_sb(sbp[1]); + swp = valid[1] && + (!valid[0] || + le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); + + if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { + brelse(sbh[1]); + sbh[1] = NULL; + sbp[1] = NULL; + swp = 0; + } + if (!valid[swp]) { + nilfs_release_super_block(nilfs); + printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", + sb->s_id); + return -EINVAL; + } + + if (swp) { + printk(KERN_WARNING "NILFS warning: broken superblock, " + "using spare superblock.\n"); + nilfs_swap_super_block(nilfs); + } + + nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); + nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0; + nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + *sbpp = sbp[0]; + return 0; +} + +/** + * init_nilfs - initialize a NILFS instance. + * @nilfs: the_nilfs structure + * @sbi: nilfs_sb_info + * @data: mount options + * + * init_nilfs() performs common initialization per block device (e.g. + * reading the super block, getting disk layout information, initializing + * shared fields in the_nilfs). It takes over part of the job typically + * done by a fill_super() routine. The split exists because multiple + * NILFS instances may be mounted on the same device simultaneously. + * For multiple mounts on the same device, only the first mount + * invokes these tasks. + * + * Return Value: On success, 0 is returned. On error, a negative error + * code is returned. + */ +int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) +{ + struct super_block *sb = sbi->s_super; + struct nilfs_super_block *sbp; + struct backing_dev_info *bdi; + int blocksize; + int err; + + down_write(&nilfs->ns_sem); + if (nilfs_init(nilfs)) { + /* Load values from existing the_nilfs */ + sbp = nilfs->ns_sbp[0]; + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto out; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (sb->s_blocksize != blocksize && + !sb_set_blocksize(sb, blocksize)) { + printk(KERN_ERR "NILFS: blocksize %d unfit to device\n", + blocksize); + err = -EINVAL; + } + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + goto out; + } + + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "NILFS: unable to set blocksize\n"); + err = -EINVAL; + goto out; + } + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto failed_sbh; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (sb->s_blocksize != blocksize) { + int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + + if (blocksize < hw_blocksize) { + printk(KERN_ERR + "NILFS: blocksize %d too small for device " + "(sector-size = %d).\n", + blocksize, hw_blocksize); + err = -EINVAL; + goto failed_sbh; + } + nilfs_release_super_block(nilfs); + sb_set_blocksize(sb, blocksize); + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + /* not failed_sbh; sbh is released automatically + when reloading fails.
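The primary/secondary superblock arbitration computed in swp above boils down to one predicate: take the spare only if it is valid and either the primary is broken or the spare carries a newer write time. A standalone sketch of that decision (struct and field names are illustrative):

    #include <stdint.h>

    struct sb_candidate {
            int valid;          /* passed magic and CRC checks */
            uint64_t wtime;     /* s_wtime read from disk */
    };

    /* Returns nonzero when the spare superblock should replace the primary. */
    static int use_secondary(const struct sb_candidate *primary,
                             const struct sb_candidate *spare)
    {
            return spare->valid &&
                   (!primary->valid || spare->wtime > primary->wtime);
    }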
*/ + } + nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + + err = nilfs_store_disk_layout(nilfs, sbp); + if (err) + goto failed_sbh; + + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + + nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); + + bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; + if (!bdi) + bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; + nilfs->ns_bdi = bdi ? : &default_backing_dev_info; + + /* Finding last segment */ + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS: invalid last segment number.\n"); + err = -EINVAL; + goto failed_sbh; + } + /* Dummy values */ + nilfs->ns_free_segments_count = + nilfs->ns_nsegments - (nilfs->ns_segnum + 1); + + /* Initialize gcinode cache */ + err = nilfs_init_gccache(nilfs); + if (err) + goto failed_sbh; + + set_nilfs_init(nilfs); + err = 0; + out: + up_write(&nilfs->ns_sem); + return err; + + failed_sbh: + nilfs_release_super_block(nilfs); + goto out; +} + +int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + unsigned long ncleansegs; + int err; + + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + if (likely(!err)) + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return err; +} + +int nilfs_near_disk_full(struct the_nilfs *nilfs) +{ + struct inode *sufile = nilfs->ns_sufile; + unsigned long ncleansegs, nincsegs; + int ret; + + ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs); + if (likely(!ret)) { + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / + nilfs->ns_blocks_per_segment + 1; + if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs) + ret++; + } + return ret; +} + +int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, + int snapshot_mount) +{ + struct nilfs_sb_info *sbi; + int ret = 0; + + down_read(&nilfs->ns_sem); + if (cno == 0 || cno > nilfs->ns_cno) + goto out_unlock; + + list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { + if (sbi->s_snapshot_cno == cno && + (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { + /* exclude read-only mounts */ + ret++; + break; + } + } + /* for protecting recent checkpoints */ + if (cno >= nilfs_last_cno(nilfs)) + ret++; + + out_unlock: + up_read(&nilfs->ns_sem); + return ret; +} diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h new file mode 100644 index 000000000000..30fe58778d05 --- /dev/null +++ b/fs/nilfs2/the_nilfs.h @@ -0,0 +1,298 @@ +/* + * the_nilfs.h - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
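A quick recap of the capacity math in nilfs_count_free_blocks() and nilfs_near_disk_full() above, as a standalone sketch (the segment geometry constant is an example, not an on-disk value): free space is clean segments times blocks per segment, and the device counts as nearly full once the clean segments no longer cover the reserved segments plus those implied by the dirty block count.

    #include <stdint.h>

    #define BLOCKS_PER_SEGMENT 2048UL   /* example geometry */

    static uint64_t free_blocks(unsigned long ncleansegs)
    {
            return (uint64_t)ncleansegs * BLOCKS_PER_SEGMENT;
    }

    static int near_disk_full(unsigned long ncleansegs, unsigned long nrsvsegs,
                              unsigned long ndirtyblks)
    {
            /* segments the dirty blocks will consume, rounded up
             * (the +1 mirrors the patch) */
            unsigned long nincsegs = ndirtyblks / BLOCKS_PER_SEGMENT + 1;

            return ncleansegs <= nrsvsegs + nincsegs;
    }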
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _THE_NILFS_H +#define _THE_NILFS_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include "sb.h" + +/* the_nilfs struct */ +enum { + THE_NILFS_INIT = 0, /* Information from super_block is set */ + THE_NILFS_LOADED, /* Roll-back/roll-forward has been done and + the latest checkpoint was loaded */ + THE_NILFS_DISCONTINUED, /* 'next' pointer chain is broken */ +}; + +/** + * struct the_nilfs - struct to supervise multiple nilfs mount points + * @ns_flags: flags + * @ns_count: reference count + * @ns_bdev: block device + * @ns_bdi: backing dev info + * @ns_writer: back pointer to writable nilfs_sb_info + * @ns_sem: semaphore for shared states + * @ns_writer_mutex: mutex protecting ns_writer attach/detach + * @ns_writer_refcount: number of referrers on ns_writer + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super blocks + * @ns_sbsize: size of valid data in super block + * @ns_supers: list of nilfs super block structs + * @ns_seg_seq: segment sequence counter + * @ns_segnum: index number of the latest full segment + * @ns_nextnum: index number of the full segment index to be used next + * @ns_pseg_offset: offset of next partial segment in the current full segment + * @ns_cno: next checkpoint number + * @ns_ctime: write time of the last segment + * @ns_nongc_ctime: write time of the last segment not for cleaner operation + * @ns_ndirtyblks: number of dirty data blocks + * @ns_last_segment_lock: lock protecting fields for the latest segment + * @ns_last_pseg: start block number of the latest segment + * @ns_last_seq: sequence value of the latest segment + * @ns_last_cno: checkpoint number of the latest segment + * @ns_prot_seq: least sequence number of segments which must not be reclaimed + * @ns_free_segments_count: counter of free segments + * @ns_segctor_sem: segment constructor semaphore + * @ns_dat: DAT file inode + * @ns_cpfile: checkpoint file inode + * @ns_sufile: segusage file inode + * @ns_gc_dat: shadow inode of the DAT file inode for GC + * @ns_gc_inodes: dummy inodes to keep live blocks + * @ns_gc_inodes_h: hash list to keep dummy inodes holding live blocks + * @ns_blocksize_bits: bit length of block size + * @ns_nsegments: number of segments in filesystem + * @ns_blocks_per_segment: number of blocks per segment + * @ns_r_segments_percentage: reserved segments percentage + * @ns_nrsvsegs: number of reserved segments + * @ns_first_data_block: block number of first data block + * @ns_inode_size: size of on-disk inode + * @ns_first_ino: first not-special inode number + * @ns_crc_seed: seed value of CRC32 calculation + */ +struct the_nilfs { + unsigned long ns_flags; + atomic_t ns_count; + + struct block_device *ns_bdev; + struct backing_dev_info *ns_bdi; + struct nilfs_sb_info *ns_writer; + struct rw_semaphore ns_sem; + struct mutex ns_writer_mutex; + atomic_t ns_writer_refcount; + + /* + * used for + * - loading the latest checkpoint exclusively. + * - allocating a new full segment. + * - protecting s_dirt in the super_block struct + * (see nilfs_write_super) and the following fields.
+ */ + struct buffer_head *ns_sbh[2]; + struct nilfs_super_block *ns_sbp[2]; + time_t ns_sbwtime[2]; + unsigned ns_sbsize; + unsigned ns_mount_state; + struct list_head ns_supers; + + /* + * The following fields are dedicated to a writable FS-instance. + * Except while seeking a checkpoint, code outside the segment + * constructor must hold a segment semaphore while accessing these + * fields. + * Only one writable FS-instance exists during the lifetime of + * the_nilfs. + */ + u64 ns_seg_seq; + __u64 ns_segnum; + __u64 ns_nextnum; + unsigned long ns_pseg_offset; + __u64 ns_cno; + time_t ns_ctime; + time_t ns_nongc_ctime; + atomic_t ns_ndirtyblks; + + /* + * The following fields hold information on the latest partial segment + * written to disk with a super root. These fields are protected by + * ns_last_segment_lock. + */ + spinlock_t ns_last_segment_lock; + sector_t ns_last_pseg; + u64 ns_last_seq; + __u64 ns_last_cno; + u64 ns_prot_seq; + unsigned long ns_free_segments_count; + + struct rw_semaphore ns_segctor_sem; + + /* + * The following fields are lock-free, except during the period + * before the_nilfs is initialized. + */ + struct inode *ns_dat; + struct inode *ns_cpfile; + struct inode *ns_sufile; + struct inode *ns_gc_dat; + + /* GC inode list and hash table head */ + struct list_head ns_gc_inodes; + struct hlist_head *ns_gc_inodes_h; + + /* Disk layout information (static) */ + unsigned int ns_blocksize_bits; + unsigned long ns_nsegments; + unsigned long ns_blocks_per_segment; + unsigned long ns_r_segments_percentage; + unsigned long ns_nrsvsegs; + unsigned long ns_first_data_block; + int ns_inode_size; + int ns_first_ino; + u32 ns_crc_seed; +}; + +#define NILFS_GCINODE_HASH_BITS 8 +#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS) + +#define THE_NILFS_FNS(bit, name) \ +static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline int nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} + +THE_NILFS_FNS(INIT, init) +THE_NILFS_FNS(LOADED, loaded) +THE_NILFS_FNS(DISCONTINUED, discontinued) + +/* Minimum interval of periodic superblock updates (in seconds) */ +#define NILFS_SB_FREQ 10 +#define NILFS_ALTSB_FREQ 60 /* spare superblock */ + +void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); +struct the_nilfs *alloc_nilfs(struct block_device *); +void put_nilfs(struct the_nilfs *); +int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); +int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); +int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); +int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); +int nilfs_near_disk_full(struct the_nilfs *); +void nilfs_fall_back_super_block(struct the_nilfs *); +void nilfs_swap_super_block(struct the_nilfs *); + + +static inline void get_nilfs(struct the_nilfs *nilfs) +{ + /* Caller must have at least one reference to the_nilfs.
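For readers unfamiliar with the token-pasting idiom, the THE_NILFS_FNS() generator above expands THE_NILFS_FNS(INIT, init) into the following three helpers (hand expansion shown for readability; nothing here is added by the patch):

    static inline void set_nilfs_init(struct the_nilfs *nilfs)
    {
            set_bit(THE_NILFS_INIT, &(nilfs)->ns_flags);
    }

    static inline void clear_nilfs_init(struct the_nilfs *nilfs)
    {
            clear_bit(THE_NILFS_INIT, &(nilfs)->ns_flags);
    }

    static inline int nilfs_init(struct the_nilfs *nilfs)
    {
            return test_bit(THE_NILFS_INIT, &(nilfs)->ns_flags);
    }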
*/ + atomic_inc(&nilfs->ns_count); +} + +static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) +{ + if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) + mutex_lock(&nilfs->ns_writer_mutex); + return nilfs->ns_writer; +} + +static inline void nilfs_put_writer(struct the_nilfs *nilfs) +{ + if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + mutex_lock(&nilfs->ns_writer_mutex); + nilfs->ns_writer = sbi; + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + mutex_lock(&nilfs->ns_writer_mutex); + if (sbi == nilfs->ns_writer) + nilfs->ns_writer = NULL; + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, + sector_t *seg_start, sector_t *seg_end) +{ + *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum; + *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1; + if (segnum == 0) + *seg_start = nilfs->ns_first_data_block; +} + +static inline sector_t +nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum) +{ + return (segnum == 0) ? nilfs->ns_first_data_block : + (sector_t)nilfs->ns_blocks_per_segment * segnum; +} + +static inline __u64 +nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr) +{ + sector_t segnum = blocknr; + + sector_div(segnum, nilfs->ns_blocks_per_segment); + return segnum; +} + +static inline void +nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start, + sector_t seg_end) +{ + /* terminate the current full segment (used in case of I/O-error) */ + nilfs->ns_pseg_offset = seg_end - seg_start + 1; +} + +static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs) +{ + /* move forward with a full segment */ + nilfs->ns_segnum = nilfs->ns_nextnum; + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq++; +} + +static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs) +{ + __u64 cno; + + spin_lock(&nilfs->ns_last_segment_lock); + cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + return cno; +} + +static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) +{ + return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; +} + +#endif /* _THE_NILFS_H */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index bed766e435b5..1634319e2404 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -220,7 +220,7 @@ static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, rem = 0; } - kevent->name = kmalloc(len + rem, GFP_KERNEL); + kevent->name = kmalloc(len + rem, GFP_NOFS); if (unlikely(!kevent->name)) { kmem_cache_free(event_cachep, kevent); return NULL; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 7d604480557a..b574431a031d 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -290,6 +290,21 @@ out_attach: else mlog_errno(ret); + /* + * In case of error, manually free the allocation and do the iput(). + * We need to do this because error here means no d_instantiate(), + * which means iput() will not be called during dput(dentry). 
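One subtlety in the nilfs_get_writer()/nilfs_put_writer() pair above deserves a note: ns_writer_refcount starts at -1, so the getter that brings it to 0 takes ns_writer_mutex and the putter that brings it back to -1 releases it. A hedged pthread/C11 analogue of just that counting trick (illustrative only, not kernel code):

    #include <pthread.h>
    #include <stdatomic.h>

    static atomic_int writer_refcount = -1;   /* -1 means "no current user" */
    static pthread_mutex_t writer_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void get_writer(void)
    {
            /* first getter: count goes -1 -> 0, take the lock */
            if (atomic_fetch_add(&writer_refcount, 1) == -1)
                    pthread_mutex_lock(&writer_mutex);
    }

    static void put_writer(void)
    {
            /* last putter: count goes 0 -> -1, drop the lock */
            if (atomic_fetch_sub(&writer_refcount, 1) == 0)
                    pthread_mutex_unlock(&writer_mutex);
    }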
+ */ + if (ret < 0 && !alias) { + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); + } + dput(alias); return ret; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index e71160cda110..c5752305627c 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2697,7 +2697,7 @@ static int ocfs2_dx_dir_index_block(struct inode *dir, u32 *num_dx_entries, struct buffer_head *dirent_bh) { - int ret, namelen, i; + int ret = 0, namelen, i; char *de_buf, *limit; struct ocfs2_dir_entry *de; struct buffer_head *dx_leaf_bh; @@ -2934,7 +2934,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, */ BUG_ON(alloc > 2); - ret = ocfs2_reserve_clusters(osb, alloc, &data_ac); + ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index de3da8eb558c..15713cbb865c 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -100,7 +100,8 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, /* If the inode allocator bit is clear, this inode must be stale */ if (!set) { - mlog(0, "inode %llu suballoc bit is clear\n", blkno); + mlog(0, "inode %llu suballoc bit is clear\n", + (unsigned long long)blkno); status = -ESTALE; goto unlock_nfs_sync; } @@ -114,7 +115,7 @@ check_err: if (status < 0) { if (status == -ESTALE) { mlog(0, "stale inode ino: %llu generation: %u\n", - blkno, handle->ih_generation); + (unsigned long long)blkno, handle->ih_generation); } result = ERR_PTR(status); goto bail; @@ -129,8 +130,8 @@ check_err: check_gen: if (handle->ih_generation != inode->i_generation) { iput(inode); - mlog(0, "stale inode ino: %llu generation: %u\n", blkno, - handle->ih_generation); + mlog(0, "stale inode ino: %llu generation: %u\n", + (unsigned long long)blkno, handle->ih_generation); result = ERR_PTR(-ESTALE); goto bail; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a5887df2cd8a..c2a87c885b73 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1912,6 +1912,22 @@ out_sems: return written ? 
written : ret; } +static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, + struct file *out, + struct splice_desc *sd) +{ + int ret; + + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + sd->total_len, 0, NULL); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return splice_from_pipe_feed(pipe, sd, pipe_to_file); +} + static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, @@ -1919,34 +1935,76 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, unsigned int flags) { int ret; - struct inode *inode = out->f_path.dentry->d_inode; + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, (unsigned int)len, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name); - inode_double_lock(inode, pipe->inode); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, - NULL); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) + mlog_errno(ret); + else { + ret = ocfs2_splice_to_file(pipe, out, &sd); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); - ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); -out_unlock: - ocfs2_rw_unlock(inode, 1); -out: - inode_double_unlock(inode, pipe->inode); + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + + *ppos += ret; + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. 
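The nr_pages value computed above is the standard round-up-to-pages idiom, later fed to balance_dirty_pages_ratelimited_nr(). As a standalone check of the arithmetic, assuming 4 KiB pages (the PAGE_CACHE_SHIFT value is an assumption of this sketch):

    #include <stdio.h>

    #define PAGE_CACHE_SHIFT 12
    #define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

    /* Pages needed to cover 'bytes' bytes, rounding up. */
    static unsigned long bytes_to_pages(unsigned long bytes)
    {
            return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    }

    int main(void)
    {
            printf("%lu %lu %lu\n", bytes_to_pages(1),
                   bytes_to_pages(4096), bytes_to_pages(4097)); /* 1 1 2 */
            return 0;
    }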
+ */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = ocfs2_rw_lock(inode, 1); + if (err < 0) { + mlog_errno(err); + } else { + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } mlog_exit(ret); return ret; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 619dd7f6c053..eb7b76331eb7 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -437,8 +437,9 @@ static inline int ocfs2_unlink_credits(struct super_block *sb) } /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + - * inode alloc group descriptor + orphan dir index leaf */ -#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3) + * inode alloc group descriptor + orphan dir index root + + * orphan dir index leaf */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2220f93f668b..33464c6b60a2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1025,10 +1025,8 @@ static int ocfs2_rename(struct inode *old_dir, struct inode *orphan_dir = NULL; struct ocfs2_dinode *newfe = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; - struct buffer_head *orphan_entry_bh = NULL; struct buffer_head *newfe_bh = NULL; struct buffer_head *old_inode_bh = NULL; - struct buffer_head *insert_entry_bh = NULL; struct ocfs2_super *osb = NULL; u64 newfe_blkno, old_de_ino; handle_t *handle = NULL; @@ -1455,8 +1453,6 @@ bail: brelse(old_inode_bh); brelse(old_dir_bh); brelse(new_dir_bh); - brelse(orphan_entry_bh); - brelse(insert_entry_bh); mlog_exit(status); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index b4ca5911caaf..8439f6b324b9 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2197,26 +2197,29 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, struct buffer_head *inode_bh = NULL; struct ocfs2_dinode *inode_fe; - mlog_entry("blkno: %llu\n", blkno); + mlog_entry("blkno: %llu\n", (unsigned long long)blkno); /* dirty read disk */ status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); if (status < 0) { - mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status); + mlog(ML_ERROR, "read block %llu failed %d\n", + (unsigned long long)blkno, status); goto bail; } inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; if (!OCFS2_IS_VALID_DINODE(inode_fe)) { - mlog(ML_ERROR, "invalid inode %llu requested\n", blkno); + mlog(ML_ERROR, "invalid inode %llu requested\n", + (unsigned long long)blkno); status = -EINVAL; goto bail; } - if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT && + if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", - blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); + (unsigned long long)blkno, + (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); status = -EINVAL; goto bail; } @@ -2251,7 +2254,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, u64 bg_blkno; int status; - mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); + mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, + (unsigned int)bit); alloc_fe 
= (struct ocfs2_dinode *)alloc_bh->b_data; if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { @@ -2266,7 +2270,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, &group_bh); if (status < 0) { - mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status); + mlog(ML_ERROR, "read group %llu failed %d\n", + (unsigned long long)bg_blkno, status); goto bail; } @@ -2300,7 +2305,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; - mlog_entry("blkno: %llu", blkno); + mlog_entry("blkno: %llu", (unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, &suballoc_bit); diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index ed0a0cfd68d2..579dd1b1110f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/utsname.h> +#include <linux/namei.h> #define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> @@ -54,26 +55,6 @@ #include "buffer_head_io.h" -static char *ocfs2_page_getlink(struct dentry * dentry, - struct page **ppage); -static char *ocfs2_fast_symlink_getlink(struct inode *inode, - struct buffer_head **bh); - -/* get the link contents into pagecache */ -static char *ocfs2_page_getlink(struct dentry * dentry, - struct page **ppage) -{ - struct page * page; - struct address_space *mapping = dentry->d_inode->i_mapping; - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) - goto sync_fail; - *ppage = page; - return kmap(page); - -sync_fail: - return (char*)page; -} static char *ocfs2_fast_symlink_getlink(struct inode *inode, struct buffer_head **bh) @@ -128,40 +109,55 @@ out: return ret; } -static void *ocfs2_follow_link(struct dentry *dentry, - struct nameidata *nd) +static void *ocfs2_fast_follow_link(struct dentry *dentry, + struct nameidata *nd) { - int status; - char *link; + int status = 0; + int len; + char *target, *link = ERR_PTR(-ENOMEM); struct inode *inode = dentry->d_inode; - struct page *page = NULL; struct buffer_head *bh = NULL; - - if (ocfs2_inode_is_fast_symlink(inode)) - link = ocfs2_fast_symlink_getlink(inode, &bh); - else - link = ocfs2_page_getlink(dentry, &page); - if (IS_ERR(link)) { - status = PTR_ERR(link); + + mlog_entry_void(); + + BUG_ON(!ocfs2_inode_is_fast_symlink(inode)); + target = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(target)) { + status = PTR_ERR(target); mlog_errno(status); goto bail; } - status = vfs_follow_link(nd, link); + /* Fast symlinks can't be large */ + len = strlen(target); + link = kzalloc(len + 1, GFP_NOFS); + if (!link) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + memcpy(link, target, len); + nd_set_link(nd, link); bail: - if (page) { - kunmap(page); - page_cache_release(page); - } brelse(bh); - return ERR_PTR(status); + mlog_exit(status); + return status ? 
ERR_PTR(status) : link; +} + +static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *link = cookie; + + kfree(link); } const struct inode_operations ocfs2_symlink_inode_operations = { .readlink = page_readlink, - .follow_link = ocfs2_follow_link, + .follow_link = page_follow_link_light, + .put_link = page_put_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, .setxattr = generic_setxattr, @@ -171,7 +167,8 @@ const struct inode_operations ocfs2_symlink_inode_operations = { }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, - .follow_link = ocfs2_follow_link, + .follow_link = ocfs2_fast_follow_link, + .put_link = ocfs2_fast_put_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, .setxattr = generic_setxattr, diff --git a/fs/open.c b/fs/open.c index 377eb25b6abf..bdfbf03615a4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1033,7 +1033,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) if (!IS_ERR(tmp)) { fd = get_unused_fd_flags(flags); if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, flags, mode); + struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 38e337d51ced..99e33ef40be4 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -19,6 +19,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/blktrace_api.h> #include "check.h" @@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = { static struct attribute_group *part_attr_groups[] = { &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif NULL }; diff --git a/fs/pipe.c b/fs/pipe.c index 4af7aa521813..13414ec45b8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -37,6 +37,42 @@ * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ +static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +{ + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, subclass); +} + +void pipe_lock(struct pipe_inode_info *pipe) +{ + /* + * pipe_lock() nests non-pipe inode locks (for writing to a file) + */ + pipe_lock_nested(pipe, I_MUTEX_PARENT); +} +EXPORT_SYMBOL(pipe_lock); + +void pipe_unlock(struct pipe_inode_info *pipe) +{ + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); +} +EXPORT_SYMBOL(pipe_unlock); + +void pipe_double_lock(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2) +{ + BUG_ON(pipe1 == pipe2); + + if (pipe1 < pipe2) { + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + } else { + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + } +} + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe) { @@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe) * is considered a noninteractive wait: */ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); schedule(); finish_wait(&pipe->wait, &wait); - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); } static int diff --git a/fs/proc/array.c b/fs/proc/array.c index 7e4877d9dcb5..725a650bbbb8 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -80,6 +80,7 @@ #include <linux/delayacct.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> +#include <linux/ptrace.h> 
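Looking back at the pipe_double_lock() helper added in fs/pipe.c above: taking two locks of the same class is deadlock-free only if every caller acquires them in one global order, which the patch derives from the pointer values. A minimal pthread sketch of the same discipline (illustrative, not kernel code):

    #include <assert.h>
    #include <pthread.h>

    /* Acquire two distinct mutexes in a globally consistent (address) order
     * so concurrent callers can never hold them in opposite orders. */
    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            assert(a != b);         /* mirrors the BUG_ON() in the patch */

            if (a > b) {
                    pthread_mutex_t *tmp = a;
                    a = b;
                    b = tmp;
            }
            pthread_mutex_lock(a);  /* lower address first */
            pthread_mutex_lock(b);
    }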
#include <linux/tracehook.h> #include <asm/pgtable.h> @@ -352,6 +353,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, char state; pid_t ppid = 0, pgid = -1, sid = -1; int num_threads = 0; + int permitted; struct mm_struct *mm; unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0; @@ -364,11 +366,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; + permitted = ptrace_may_access(task, PTRACE_MODE_READ); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); - eip = KSTK_EIP(task); - esp = KSTK_ESP(task); + if (permitted) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } } get_task_comm(tcomm, task); @@ -424,7 +429,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unlock_task_sighand(task, &flags); } - if (!whole || num_threads < 2) + if (permitted && (!whole || num_threads < 2)) wchan = get_wchan(task); if (!whole) { min_flt = task->min_flt; @@ -476,7 +481,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - mm ? mm->start_stack : 0, + (permitted && mm) ? mm->start_stack : 0, esp, eip, /* The signal information here is obsolete. diff --git a/fs/proc/base.c b/fs/proc/base.c index f71559784bfb..fb45615943c2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -322,7 +322,10 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) wchan = get_wchan(task); if (lookup_symbol_name(wchan, symname) < 0) - return sprintf(buffer, "%lu", wchan); + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return 0; + else + return sprintf(buffer, "%lu", wchan); else return sprintf(buffer, "%s", symname); } @@ -648,14 +651,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; struct mnt_namespace *ns = p->ns; - unsigned res = 0; + unsigned res = POLLIN | POLLRDNORM; poll_wait(file, &ns->poll, wait); spin_lock(&vfsmount_lock); if (p->event != ns->event) { p->event = ns->event; - res = POLLERR; + res |= POLLERR | POLLPRI; } spin_unlock(&vfsmount_lock); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 74ea974f5ca6..c6b0302af4c4 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #define K(x) ((x) << (PAGE_SHIFT - 10)) si_meminfo(&i); si_swapinfo(&i); - committed = atomic_long_read(&vm_committed_space); + committed = percpu_counter_read_positive(&vm_committed_as); allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; diff --git a/fs/proc/root.c b/fs/proc/root.c index 1e15a2b176e8..b080b791d9e3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -67,8 +67,7 @@ static int proc_get_sb(struct file_system_type *fs_type, sb->s_flags = flags; err = proc_fill_super(sb); if (err) { - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); return err; } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index f75efa22df5e..81e4eb60972e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -18,6 +18,9 @@ #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif +#ifndef arch_idle_time +#define arch_idle_time(cpu) 0 +#endif static int show_stat(struct seq_file *p, void *v) { @@ -40,6 +43,7 @@ static int show_stat(struct seq_file *p, void *v) nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); idle = 
cputime64_add(idle, kstat_cpu(i).cpustat.idle); + idle = cputime64_add(idle, arch_idle_time(i)); iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); @@ -69,6 +73,7 @@ static int show_stat(struct seq_file *p, void *v) nice = kstat_cpu(i).cpustat.nice; system = kstat_cpu(i).cpustat.system; idle = kstat_cpu(i).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(i)); iowait = kstat_cpu(i).cpustat.iowait; irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b0ae0be4801f..6f61b7cc32e0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) struct file *file = vma->vm_file; int flags = vma->vm_flags; unsigned long ino = 0; + unsigned long long pgoff = 0; dev_t dev = 0; int len; @@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", @@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + pgoff, MAJOR(dev), MINOR(dev), ino, &len); /* @@ -663,6 +665,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_task; ret = 0; + + if (!count) + goto out_task; + mm = get_task_mm(task); if (!mm) goto out_task; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 863464d5519c..64a72e2e7650 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -126,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct file *file; dev_t dev = 0; int flags, len; + unsigned long long pgoff = 0; flags = vma->vm_flags; file = vma->vm_file; @@ -134,6 +135,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } seq_printf(m, @@ -144,7 +146,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, + pgoff, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 385a0831cc99..68d4f6dc0578 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -1,12 +1,3 @@ -# -# Makefile for the Linux filesystems. -# -# 14 Sep 2000, Christoph Hellwig <hch@infradead.org> -# Rewritten to use lists instead of if-statements. 
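
Both /proc maps hunks above compute the printed file offset as ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, widening before shifting. A stand-alone sketch of why that order matters when the page offset is a 32-bit type, as it is on 32-bit kernels (uint32_t stands in for unsigned long, and PAGE_SHIFT is assumed to be 12):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t pgoff = 0x100000;	/* page offset 4 GiB into a file */

	/* Shift first, widen later: the shift wraps in 32 bits -> 0. */
	uint64_t wrong = (uint64_t)(pgoff << 12);

	/* Widen first, then shift: the full offset 0x100000000 survives. */
	uint64_t right = (uint64_t)pgoff << 12;

	printf("wrong=%#llx right=%#llx\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}
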
-# - -obj-y := - obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a404fb88e456..3a6b193d8444 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) save_mount_options(sb, data); fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; if (!fsi) { err = -ENOMEM; goto fail; } - sb->s_fs_info = fsi; err = ramfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = RAMFS_MAGIC; - sb->s_op = &ramfs_ops; - sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RAMFS_MAGIC; + sb->s_op = &ramfs_ops; + sb->s_time_gran = 1; + inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { err = -ENOMEM; @@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) } root = d_alloc_root(inode); + sb->s_root = root; if (!root) { err = -ENOMEM; goto fail; } - sb->s_root = root; + return 0; fail: kfree(fsi); + sb->s_fs_info = NULL; iput(inode); return err; } diff --git a/fs/read_write.c b/fs/read_write.c index 6d5d8ff238aa..9d1e76bb9ee1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -731,10 +731,16 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) +{ +#define HALF_LONG_BITS (BITS_PER_LONG / 2) + return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; +} + SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, u32, pos_high, u32, pos_low) + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; + loff_t pos = pos_from_hilo(pos_h, pos_l); struct file *file; ssize_t ret = -EBADF; int fput_needed; @@ -757,9 +763,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, u32, pos_high, u32, pos_low) + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; + loff_t pos = pos_from_hilo(pos_h, pos_l); struct file *file; ssize_t ret = -EBADF; int fput_needed; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 67a80d7e59e2..45ee3d357c70 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -41,6 +41,18 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, #define store_ih(where,what) copy_item_head (where, what) +static inline bool is_privroot_deh(struct dentry *dir, + struct reiserfs_de_head *deh) +{ + int ret = 0; +#ifdef CONFIG_REISERFS_FS_XATTR + struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; + ret = (dir == dir->d_parent && privroot->d_inode && + deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); +#endif + return ret; +} + int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, filldir_t filldir, loff_t *pos) { @@ -138,18 +150,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, } /* Ignore the .reiserfs_priv entry */ - if (reiserfs_xattrs(inode->i_sb) && - !old_format_only(inode->i_sb) && 
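
The pos_from_hilo() helper above splits the shift into two HALF_LONG_BITS steps on purpose: a single shift by BITS_PER_LONG would be undefined C, and on 64-bit builds the two-step form shifts the high word out entirely, so only the low argument contributes. A user-space mirror of the arithmetic for experimentation (unsigned long long stands in for loff_t to keep the sketch free of signed-overflow corner cases):

#include <stdio.h>

#define BITS_PER_LONG	((int)(8 * sizeof(long)))
#define HALF_LONG_BITS	(BITS_PER_LONG / 2)

static unsigned long long pos_from_hilo(unsigned long high, unsigned long low)
{
	return (((unsigned long long)high << HALF_LONG_BITS)
		<< HALF_LONG_BITS) | low;
}

int main(void)
{
	/* 32-bit build (HALF_LONG_BITS == 16): prints 0x100000000.
	 * 64-bit build (HALF_LONG_BITS == 32): prints 0, the high
	 * word having been deliberately shifted away. */
	printf("%#llx\n", pos_from_hilo(1, 0));
	return 0;
}
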
- dentry == inode->i_sb->s_root && - REISERFS_SB(inode->i_sb)->priv_root && - REISERFS_SB(inode->i_sb)->priv_root->d_inode - && deh_objectid(deh) == - le32_to_cpu(INODE_PKEY - (REISERFS_SB(inode->i_sb)-> - priv_root->d_inode)-> - k_objectid)) { + if (is_privroot_deh(dentry, deh)) continue; - } d_off = deh_offset(deh); *pos = d_off; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index efd4d720718e..271579128634 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -338,21 +338,8 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, &path_to_entry, &de); pathrelse(&path_to_entry); if (retval == NAME_FOUND) { - /* Hide the .reiserfs_priv directory */ - if (reiserfs_xattrs(dir->i_sb) && - !old_format_only(dir->i_sb) && - REISERFS_SB(dir->i_sb)->priv_root && - REISERFS_SB(dir->i_sb)->priv_root->d_inode && - de.de_objectid == - le32_to_cpu(INODE_PKEY - (REISERFS_SB(dir->i_sb)->priv_root->d_inode)-> - k_objectid)) { - reiserfs_write_unlock(dir->i_sb); - return ERR_PTR(-EACCES); - } - - inode = - reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + inode = reiserfs_iget(dir->i_sb, + (struct cpu_key *)&(de.de_dir_id)); if (!inode || IS_ERR(inode)) { reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0ae6486d9046..3567fb9e3fb1 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -448,13 +448,11 @@ int remove_save_link(struct inode *inode, int truncate) static void reiserfs_kill_sb(struct super_block *s) { if (REISERFS_SB(s)) { -#ifdef CONFIG_REISERFS_FS_XATTR if (REISERFS_SB(s)->xattr_root) { d_invalidate(REISERFS_SB(s)->xattr_root); dput(REISERFS_SB(s)->xattr_root); REISERFS_SB(s)->xattr_root = NULL; } -#endif if (REISERFS_SB(s)->priv_root) { d_invalidate(REISERFS_SB(s)->priv_root); dput(REISERFS_SB(s)->priv_root); @@ -1316,8 +1314,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) } out_ok: - kfree(s->s_options); - s->s_options = new_opts; + replace_mount_options(s, new_opts); return 0; out_err: @@ -1842,7 +1839,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error; } - if ((errval = reiserfs_xattr_init(s, s->s_flags))) { + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { dput(s->s_root); s->s_root = NULL; goto error; @@ -1855,7 +1853,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_info(s, "using 3.5.x disk format\n"); } - if ((errval = reiserfs_xattr_init(s, s->s_flags))) { + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { dput(s->s_root); s->s_root = NULL; goto error; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index f83f52bae390..8e7deb0e6964 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -113,41 +113,30 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) #define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) -/* Returns and possibly creates the xattr dir. 
*/ -static struct dentry *lookup_or_create_dir(struct dentry *parent, - const char *name, int flags) +static struct dentry *open_xa_root(struct super_block *sb, int flags) { - struct dentry *dentry; - BUG_ON(!parent); - - dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) - return dentry; - else if (!dentry->d_inode) { - int err = -ENODATA; + struct dentry *privroot = REISERFS_SB(sb)->priv_root; + struct dentry *xaroot; + if (!privroot->d_inode) + return ERR_PTR(-ENODATA); - if (xattr_may_create(flags)) { - mutex_lock_nested(&parent->d_inode->i_mutex, - I_MUTEX_XATTR); - err = xattr_mkdir(parent->d_inode, dentry, 0700); - mutex_unlock(&parent->d_inode->i_mutex); - } + mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR); + xaroot = dget(REISERFS_SB(sb)->xattr_root); + if (!xaroot) + xaroot = ERR_PTR(-ENODATA); + else if (!xaroot->d_inode) { + int err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_mkdir(privroot->d_inode, xaroot, 0700); if (err) { - dput(dentry); - dentry = ERR_PTR(err); + dput(xaroot); + xaroot = ERR_PTR(err); } } - return dentry; -} - -static struct dentry *open_xa_root(struct super_block *sb, int flags) -{ - struct dentry *privroot = REISERFS_SB(sb)->priv_root; - if (!privroot) - return ERR_PTR(-ENODATA); - return lookup_or_create_dir(privroot, XAROOT_NAME, flags); + mutex_unlock(&privroot->d_inode->i_mutex); + return xaroot; } static struct dentry *open_xa_dir(const struct inode *inode, int flags) @@ -163,10 +152,22 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - xadir = lookup_or_create_dir(xaroot, namebuf, flags); + mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR); + + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); + if (!IS_ERR(xadir) && !xadir->d_inode) { + int err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_mkdir(xaroot->d_inode, xadir, 0700); + if (err) { + dput(xadir); + xadir = ERR_PTR(err); + } + } + + mutex_unlock(&xaroot->d_inode->i_mutex); dput(xaroot); return xadir; - } /* The following are side effects of other operations that aren't explicitly @@ -184,6 +185,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, { struct reiserfs_dentry_buf *dbuf = buf; struct dentry *dentry; + WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -349,6 +351,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); + mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); @@ -360,18 +363,15 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (!xafile->d_inode) { err = -ENODATA; - if (xattr_may_create(flags)) { - mutex_lock_nested(&xadir->d_inode->i_mutex, - I_MUTEX_XATTR); + if (xattr_may_create(flags)) err = xattr_create(xadir->d_inode, xafile, 0700|S_IFREG); - mutex_unlock(&xadir->d_inode->i_mutex); - } } if (err) dput(xafile); out: + mutex_unlock(&xadir->d_inode->i_mutex); dput(xadir); if (err) return ERR_PTR(err); @@ -435,6 +435,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); + mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ 
-442,14 +443,13 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) } if (dentry->d_inode) { - mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); err = xattr_unlink(xadir->d_inode, dentry); - mutex_unlock(&xadir->d_inode->i_mutex); update_ctime(inode); } dput(dentry); out_dput: + mutex_unlock(&xadir->d_inode->i_mutex); dput(xadir); return err; } @@ -687,20 +687,6 @@ out: return err; } -/* Actual operations that are exported to VFS-land */ -struct xattr_handler *reiserfs_xattr_handlers[] = { - &reiserfs_xattr_user_handler, - &reiserfs_xattr_trusted_handler, -#ifdef CONFIG_REISERFS_FS_SECURITY - &reiserfs_xattr_security_handler, -#endif -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - &reiserfs_posix_acl_access_handler, - &reiserfs_posix_acl_default_handler, -#endif - NULL -}; - /* * In order to implement different sets of xattr operations for each xattr * prefix with the generic xattr API, a filesystem should create a @@ -843,7 +829,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) if (!dentry->d_inode) return -EINVAL; - if (!reiserfs_xattrs(dentry->d_sb) || + if (!dentry->d_sb->s_xattr || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; @@ -885,42 +871,50 @@ static int reiserfs_check_acl(struct inode *inode, int mask) return error; } -int reiserfs_permission(struct inode *inode, int mask) -{ - /* - * We don't do permission checks on the internal objects. - * Permissions are determined by the "owning" object. - */ - if (IS_PRIVATE(inode)) - return 0; - /* - * Stat data v1 doesn't support ACLs. - */ - if (get_inode_sd_version(inode) == STAT_DATA_V1) - return generic_permission(inode, mask, NULL); - else - return generic_permission(inode, mask, reiserfs_check_acl); -} - static int create_privroot(struct dentry *dentry) { int err; struct inode *inode = dentry->d_parent->d_inode; - mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR); + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + err = xattr_mkdir(inode, dentry, 0700); - mutex_unlock(&inode->i_mutex); - if (err) { - dput(dentry); - dentry = NULL; + if (err || !dentry->d_inode) { + reiserfs_warning(dentry->d_sb, "jdm-20006", + "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. 
" + "Failing mount."); + return -EOPNOTSUPP; } - if (dentry && dentry->d_inode) - reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " - "storage.\n", PRIVROOT_NAME); + dentry->d_inode->i_flags |= S_PRIVATE; + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " + "storage.\n", PRIVROOT_NAME); - return err; + return 0; } +#else +int __init reiserfs_xattr_register_handlers(void) { return 0; } +void reiserfs_xattr_unregister_handlers(void) {} +static int create_privroot(struct dentry *dentry) { return 0; } +#endif + +/* Actual operations that are exported to VFS-land */ +struct xattr_handler *reiserfs_xattr_handlers[] = { +#ifdef CONFIG_REISERFS_FS_XATTR + &reiserfs_xattr_user_handler, + &reiserfs_xattr_trusted_handler, +#endif +#ifdef CONFIG_REISERFS_FS_SECURITY + &reiserfs_xattr_security_handler, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + &reiserfs_posix_acl_access_handler, + &reiserfs_posix_acl_default_handler, +#endif + NULL +}; + static int xattr_mount_check(struct super_block *s) { /* We need generation numbers to ensure that the oid mapping is correct @@ -940,21 +934,33 @@ static int xattr_mount_check(struct super_block *s) return 0; } -#else -int __init reiserfs_xattr_register_handlers(void) { return 0; } -void reiserfs_xattr_unregister_handlers(void) {} +int reiserfs_permission(struct inode *inode, int mask) +{ + /* + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. + */ + if (IS_PRIVATE(inode)) + return 0; + +#ifdef CONFIG_REISERFS_FS_XATTR + /* + * Stat data v1 doesn't support ACLs. + */ + if (get_inode_sd_version(inode) != STAT_DATA_V1) + return generic_permission(inode, mask, reiserfs_check_acl); #endif + return generic_permission(inode, mask, NULL); +} /* This will catch lookups from the fs root to .reiserfs_priv */ static int xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) { struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; - if (name->len == priv_root->d_name.len && - name->hash == priv_root->d_name.hash && - !memcmp(name->name, priv_root->d_name.name, name->len)) { + if (container_of(q1, struct dentry, d_name) == priv_root) return -ENOENT; - } else if (q1->len == name->len && + if (q1->len == name->len && !memcmp(q1->name, name->name, name->len)) return 0; return 1; @@ -964,73 +970,71 @@ static const struct dentry_operations xattr_lookup_poison_ops = { .d_compare = xattr_lookup_poison, }; +int reiserfs_lookup_privroot(struct super_block *s) +{ + struct dentry *dentry; + int err = 0; + + /* If we don't have the privroot located yet - go find it */ + mutex_lock(&s->s_root->d_inode->i_mutex); + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, + strlen(PRIVROOT_NAME)); + if (!IS_ERR(dentry)) { + REISERFS_SB(s)->priv_root = dentry; + s->s_root->d_op = &xattr_lookup_poison_ops; + if (dentry->d_inode) + dentry->d_inode->i_flags |= S_PRIVATE; + } else + err = PTR_ERR(dentry); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + return err; +} + /* We need to take a copy of the mount flags since things like * MS_RDONLY don't get set until *after* we're called. 
* mount_flags != mount_options */ int reiserfs_xattr_init(struct super_block *s, int mount_flags) { int err = 0; + struct dentry *privroot = REISERFS_SB(s)->priv_root; -#ifdef CONFIG_REISERFS_FS_XATTR err = xattr_mount_check(s); if (err) goto error; -#endif - /* If we don't have the privroot located yet - go find it */ - if (!REISERFS_SB(s)->priv_root) { - struct dentry *dentry; - dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, - strlen(PRIVROOT_NAME)); - if (!IS_ERR(dentry)) { -#ifdef CONFIG_REISERFS_FS_XATTR - if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) - err = create_privroot(dentry); -#endif - if (!dentry->d_inode) { - dput(dentry); - dentry = NULL; - } - } else - err = PTR_ERR(dentry); - - if (!err && dentry) { - s->s_root->d_op = &xattr_lookup_poison_ops; - dentry->d_inode->i_flags |= S_PRIVATE; - REISERFS_SB(s)->priv_root = dentry; -#ifdef CONFIG_REISERFS_FS_XATTR - /* xattrs are unavailable */ - } else if (!(mount_flags & MS_RDONLY)) { - /* If we're read-only it just means that the dir - * hasn't been created. Not an error -- just no - * xattrs on the fs. We'll check again if we - * go read-write */ - reiserfs_warning(s, "jdm-20006", - "xattrs/ACLs enabled and couldn't " - "find/create .reiserfs_priv. " - "Failing mount."); - err = -EOPNOTSUPP; -#endif - } + if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { + mutex_lock(&s->s_root->d_inode->i_mutex); + err = create_privroot(REISERFS_SB(s)->priv_root); + mutex_unlock(&s->s_root->d_inode->i_mutex); } -#ifdef CONFIG_REISERFS_FS_XATTR - if (!err) + if (privroot->d_inode) { s->s_xattr = reiserfs_xattr_handlers; + mutex_lock(&privroot->d_inode->i_mutex); + if (!REISERFS_SB(s)->xattr_root) { + struct dentry *dentry; + dentry = lookup_one_len(XAROOT_NAME, privroot, + strlen(XAROOT_NAME)); + if (!IS_ERR(dentry)) + REISERFS_SB(s)->xattr_root = dentry; + else + err = PTR_ERR(dentry); + } + mutex_unlock(&privroot->d_inode->i_mutex); + } error: if (err) { clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); } -#endif /* The super_block MS_POSIXACL must mirror the (no)acl mount option. 
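
Once reiserfs_xattr_init() above publishes reiserfs_xattr_handlers through s->s_xattr, the generic xattr code walks that NULL-terminated table and dispatches on the attribute-name prefix. A condensed sketch of the dispatch pattern, with deliberately simplified stand-in types (not the kernel's struct xattr_handler):

#include <stddef.h>
#include <string.h>

struct handler {
	const char *prefix;	/* "user.", "trusted.", ... */
	int (*get)(const char *suffix, void *buf, size_t size);
};

/* Hand the call to the first entry whose prefix matches, passing on
 * only the suffix, the way the generic layer strips the namespace
 * prefix for per-namespace handlers. */
static int xattr_get(const struct handler *const *handlers,
		     const char *name, void *buf, size_t size)
{
	for (; *handlers; handlers++) {
		size_t n = strlen((*handlers)->prefix);
		if (strncmp(name, (*handlers)->prefix, n) == 0)
			return (*handlers)->get(name + n, buf, size);
	}
	return -1;	/* the kernel returns -EOPNOTSUPP here */
}
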
*/ - s->s_flags = s->s_flags & ~MS_POSIXACL; -#ifdef CONFIG_REISERFS_FS_POSIX_ACL if (reiserfs_posixacl(s)) s->s_flags |= MS_POSIXACL; -#endif + else + s->s_flags &= ~MS_POSIXACL; return err; } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 4d3c20e787c3..a92c8792c0f6 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -55,8 +55,16 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, struct reiserfs_security_handle *sec) { int blocks = 0; - int error = security_inode_init_security(inode, dir, &sec->name, - &sec->value, &sec->length); + int error; + + sec->name = NULL; + + /* Don't add selinux attributes on xattrs - they'll never get used */ + if (IS_PRIVATE(dir)) + return 0; + + error = security_inode_init_security(inode, dir, &sec->name, + &sec->value, &sec->length); if (error) { if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 1a17020f9faf..ce2d6bcc6266 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -1,6 +1,6 @@ config ROMFS_FS tristate "ROM file system support" - depends on BLOCK + depends on BLOCK || MTD ---help--- This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for @@ -14,3 +14,49 @@ config ROMFS_FS If you don't know whether you need it, then you don't need it: answer N. + +# +# Select the backing stores to be supported +# +choice + prompt "RomFS backing stores" + depends on ROMFS_FS + default ROMFS_BACKED_BY_BLOCK + help + Select the backing stores to be supported. + +config ROMFS_BACKED_BY_BLOCK + bool "Block device-backed ROM file system support" + depends on BLOCK + help + This permits ROMFS to use block devices buffered through the page + cache as the medium from which to retrieve data. It does not allow + direct mapping of the medium. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_MTD + bool "MTD-backed ROM file system support" + depends on MTD=y || (ROMFS_FS=m && MTD) + help + This permits ROMFS to use MTD based devices directly, without the + intercession of the block layer (which may have been disabled). It + also allows direct mapping of MTD devices through romfs files under + NOMMU conditions if the underlying device is directly addressable by + the CPU. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_BOTH + bool "Both the above" + depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD)) +endchoice + + +config ROMFS_ON_BLOCK + bool + default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + +config ROMFS_ON_MTD + bool + default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile index c95b21cf49a3..420beb7d495c 100644 --- a/fs/romfs/Makefile +++ b/fs/romfs/Makefile @@ -1,7 +1,12 @@ # -# Makefile for the linux romfs filesystem routines. +# Makefile for the linux RomFS filesystem routines. 
# obj-$(CONFIG_ROMFS_FS) += romfs.o -romfs-objs := inode.o +romfs-y := storage.o super.o + +ifneq ($(CONFIG_MMU),y) +romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o +endif + diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c deleted file mode 100644 index 98a232f7196b..000000000000 --- a/fs/romfs/inode.c +++ /dev/null @@ -1,665 +0,0 @@ -/* - * ROMFS file system, Linux implementation - * - * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu> - * - * Using parts of the minix filesystem - * Copyright (C) 1991, 1992 Linus Torvalds - * - * and parts of the affs filesystem additionally - * Copyright (C) 1993 Ray Burr - * Copyright (C) 1996 Hans-Joachim Widmaier - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes - * Changed for 2.1.19 modules - * Jan 1997 Initial release - * Jun 1997 2.1.43+ changes - * Proper page locking in readpage - * Changed to work with 2.1.45+ fs - * Jul 1997 Fixed follow_link - * 2.1.47 - * lookup shouldn't return -ENOENT - * from Horst von Brand: - * fail on wrong checksum - * double unlock_super was possible - * correct namelen for statfs - * spotted by Bill Hawes: - * readlink shouldn't iput() - * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() - * exposed a problem in readdir - * 2.1.107 code-freeze spellchecker run - * Aug 1998 2.1.118+ VFS changes - * Sep 1998 2.1.122 another VFS change (follow_link) - * Apr 1999 2.2.7 no more EBADF checking in - * lookup/readdir, use ERR_PTR - * Jun 1999 2.3.6 d_alloc_root use changed - * 2.3.9 clean up usage of ENOENT/negative - * dentries in lookup - * clean up page flags setting - * (error, uptodate, locking) in - * in readpage - * use init_special_inode for - * fifos/sockets (and streamline) in - * read_inode, fix _ops table order - * Aug 1999 2.3.16 __initfunc() => __init change - * Oct 1999 2.3.24 page->owner hack obsoleted - * Nov 1999 2.3.27 2.3.25+ page->offset => index change - */ - -/* todo: - * - see Documentation/filesystems/romfs.txt - * - use allocated, not stack memory for file names? - * - considering write access... - * - network (tftp) files? - * - merge back some _op tables - */ - -/* - * Sorry about some optimizations and for some goto's. I just wanted - * to squeeze some more bytes out of this code.. 
:) - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/romfs_fs.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/pagemap.h> -#include <linux/smp_lock.h> -#include <linux/buffer_head.h> -#include <linux/vfs.h> - -#include <asm/uaccess.h> - -struct romfs_inode_info { - unsigned long i_metasize; /* size of non-data area */ - unsigned long i_dataoffset; /* from the start of fs */ - struct inode vfs_inode; -}; - -static struct inode *romfs_iget(struct super_block *, unsigned long); - -/* instead of private superblock data */ -static inline unsigned long romfs_maxsize(struct super_block *sb) -{ - return (unsigned long)sb->s_fs_info; -} - -static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) -{ - return container_of(inode, struct romfs_inode_info, vfs_inode); -} - -static __u32 -romfs_checksum(void *data, int size) -{ - __u32 sum; - __be32 *ptr; - - sum = 0; ptr = data; - size>>=2; - while (size>0) { - sum += be32_to_cpu(*ptr++); - size--; - } - return sum; -} - -static const struct super_operations romfs_ops; - -static int romfs_fill_super(struct super_block *s, void *data, int silent) -{ - struct buffer_head *bh; - struct romfs_super_block *rsb; - struct inode *root; - int sz, ret = -EINVAL; - - /* I would parse the options here, but there are none.. :) */ - - sb_set_blocksize(s, ROMBSIZE); - s->s_maxbytes = 0xFFFFFFFF; - - bh = sb_bread(s, 0); - if (!bh) { - /* XXX merge with other printk? */ - printk ("romfs: unable to read superblock\n"); - goto outnobh; - } - - rsb = (struct romfs_super_block *)bh->b_data; - sz = be32_to_cpu(rsb->size); - if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 - || sz < ROMFH_SIZE) { - if (!silent) - printk ("VFS: Can't find a romfs filesystem on dev " - "%s.\n", s->s_id); - goto out; - } - if (romfs_checksum(rsb, min_t(int, sz, 512))) { - printk ("romfs: bad initial checksum on dev " - "%s.\n", s->s_id); - goto out; - } - - s->s_magic = ROMFS_MAGIC; - s->s_fs_info = (void *)(long)sz; - - s->s_flags |= MS_RDONLY; - - /* Find the start of the fs */ - sz = (ROMFH_SIZE + - strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD) - & ROMFH_MASK; - - s->s_op = &romfs_ops; - root = romfs_iget(s, sz); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } - - ret = -ENOMEM; - s->s_root = d_alloc_root(root); - if (!s->s_root) - goto outiput; - - brelse(bh); - return 0; - -outiput: - iput(root); -out: - brelse(bh); -outnobh: - return ret; -} - -/* That's simple too. 
*/ - -static int -romfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - buf->f_type = ROMFS_MAGIC; - buf->f_bsize = ROMBSIZE; - buf->f_bfree = buf->f_bavail = buf->f_ffree; - buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS; - buf->f_namelen = ROMFS_MAXFN; - return 0; -} - -/* some helper routines */ - -static int -romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count) -{ - struct buffer_head *bh; - unsigned long avail, maxsize, res; - - maxsize = romfs_maxsize(i->i_sb); - if (offset >= maxsize) - return -1; - - /* strnlen is almost always valid */ - if (count > maxsize || offset+count > maxsize) - count = maxsize-offset; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; /* error */ - - avail = ROMBSIZE - (offset & ROMBMASK); - maxsize = min_t(unsigned long, count, avail); - res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize); - brelse(bh); - - if (res < maxsize) - return res; /* found all of it */ - - while (res < count) { - offset += maxsize; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; - maxsize = min_t(unsigned long, count - res, ROMBSIZE); - avail = strnlen(bh->b_data, maxsize); - res += avail; - brelse(bh); - if (avail < maxsize) - return res; - } - return res; -} - -static int -romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count) -{ - struct buffer_head *bh; - unsigned long avail, maxsize, res; - - maxsize = romfs_maxsize(i->i_sb); - if (offset >= maxsize || count > maxsize || offset+count>maxsize) - return -1; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; /* error */ - - avail = ROMBSIZE - (offset & ROMBMASK); - maxsize = min_t(unsigned long, count, avail); - memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize); - brelse(bh); - - res = maxsize; /* all of it */ - - while (res < count) { - offset += maxsize; - dest += maxsize; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; - maxsize = min_t(unsigned long, count - res, ROMBSIZE); - memcpy(dest, bh->b_data, maxsize); - brelse(bh); - res += maxsize; - } - return res; -} - -static unsigned char romfs_dtype_table[] = { - DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO -}; - -static int -romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct inode *i = filp->f_path.dentry->d_inode; - struct romfs_inode ri; - unsigned long offset, maxoff; - int j, ino, nextfh; - int stored = 0; - char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ - - lock_kernel(); - - maxoff = romfs_maxsize(i->i_sb); - - offset = filp->f_pos; - if (!offset) { - offset = i->i_ino & ROMFH_MASK; - if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0) - goto out; - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - } - - /* Not really failsafe, but we are read-only... 
*/ - for(;;) { - if (!offset || offset >= maxoff) { - offset = maxoff; - filp->f_pos = offset; - goto out; - } - filp->f_pos = offset; - - /* Fetch inode info */ - if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0) - goto out; - - j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1); - if (j < 0) - goto out; - - fsname[j]=0; - romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j); - - ino = offset; - nextfh = be32_to_cpu(ri.next); - if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) - ino = be32_to_cpu(ri.spec); - if (filldir(dirent, fsname, j, offset, ino, - romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) { - goto out; - } - stored++; - offset = nextfh & ROMFH_MASK; - } -out: - unlock_kernel(); - return stored; -} - -static struct dentry * -romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - unsigned long offset, maxoff; - long res; - int fslen; - struct inode *inode = NULL; - char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ - struct romfs_inode ri; - const char *name; /* got from dentry */ - int len; - - res = -EACCES; /* placeholder for "no data here" */ - offset = dir->i_ino & ROMFH_MASK; - lock_kernel(); - if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0) - goto error; - - maxoff = romfs_maxsize(dir->i_sb); - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - /* OK, now find the file whose name is in "dentry" in the - * directory specified by "dir". */ - - name = dentry->d_name.name; - len = dentry->d_name.len; - - for(;;) { - if (!offset || offset >= maxoff) - goto success; /* negative success */ - if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0) - goto error; - - /* try to match the first 16 bytes of name */ - fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE); - if (len < ROMFH_SIZE) { - if (len == fslen) { - /* both are shorter, and same size */ - romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1); - if (strncmp (name, fsname, len) == 0) - break; - } - } else if (fslen >= ROMFH_SIZE) { - /* both are longer; XXX optimize max size */ - fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1); - if (len == fslen) { - romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1); - if (strncmp(name, fsname, len) == 0) - break; - } - } - /* next entry */ - offset = be32_to_cpu(ri.next) & ROMFH_MASK; - } - - /* Hard link handling */ - if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - inode = romfs_iget(dir->i_sb, offset); - if (IS_ERR(inode)) { - res = PTR_ERR(inode); - goto error; - } - -success: - d_add(dentry, inode); - res = 0; -error: - unlock_kernel(); - return ERR_PTR(res); -} - -/* - * Ok, we do readpage, to be able to execute programs. Unfortunately, - * we can't use bmap, since we may have looser alignments. - */ - -static int -romfs_readpage(struct file *file, struct page * page) -{ - struct inode *inode = page->mapping->host; - loff_t offset, size; - unsigned long filled; - void *buf; - int result = -EIO; - - page_cache_get(page); - lock_kernel(); - buf = kmap(page); - if (!buf) - goto err_out; - - /* 32 bit warning -- but not for us :) */ - offset = page_offset(page); - size = i_size_read(inode); - filled = 0; - result = 0; - if (offset < size) { - unsigned long readlen; - - size -= offset; - readlen = size > PAGE_SIZE ? 
PAGE_SIZE : size; - - filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen); - - if (filled != readlen) { - SetPageError(page); - filled = 0; - result = -EIO; - } - } - - if (filled < PAGE_SIZE) - memset(buf + filled, 0, PAGE_SIZE-filled); - - if (!result) - SetPageUptodate(page); - flush_dcache_page(page); - - unlock_page(page); - - kunmap(page); -err_out: - page_cache_release(page); - unlock_kernel(); - - return result; -} - -/* Mapping from our types to the kernel */ - -static const struct address_space_operations romfs_aops = { - .readpage = romfs_readpage -}; - -static const struct file_operations romfs_dir_operations = { - .read = generic_read_dir, - .readdir = romfs_readdir, -}; - -static const struct inode_operations romfs_dir_inode_operations = { - .lookup = romfs_lookup, -}; - -static mode_t romfs_modemap[] = -{ - 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777, - S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644 -}; - -static struct inode * -romfs_iget(struct super_block *sb, unsigned long ino) -{ - int nextfh, ret; - struct romfs_inode ri; - struct inode *i; - - ino &= ROMFH_MASK; - i = iget_locked(sb, ino); - if (!i) - return ERR_PTR(-ENOMEM); - if (!(i->i_state & I_NEW)) - return i; - - i->i_mode = 0; - - /* Loop for finding the real hard link */ - for(;;) { - if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) { - printk(KERN_ERR "romfs: read error for inode 0x%lx\n", - ino); - iget_failed(i); - return ERR_PTR(-EIO); - } - /* XXX: do romfs_checksum here too (with name) */ - - nextfh = be32_to_cpu(ri.next); - if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) - break; - - ino = be32_to_cpu(ri.spec) & ROMFH_MASK; - } - - i->i_nlink = 1; /* Hard to decide.. */ - i->i_size = be32_to_cpu(ri.size); - i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; - i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; - - /* Precalculate the data offset */ - ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN); - if (ret >= 0) - ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK; - else - ino = 0; - - ROMFS_I(i)->i_metasize = ino; - ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); - - /* Compute permissions */ - ino = romfs_modemap[nextfh & ROMFH_TYPE]; - /* only "normal" files have ops */ - switch (nextfh & ROMFH_TYPE) { - case 1: - i->i_size = ROMFS_I(i)->i_metasize; - i->i_op = &romfs_dir_inode_operations; - i->i_fop = &romfs_dir_operations; - if (nextfh & ROMFH_EXEC) - ino |= S_IXUGO; - i->i_mode = ino; - break; - case 2: - i->i_fop = &generic_ro_fops; - i->i_data.a_ops = &romfs_aops; - if (nextfh & ROMFH_EXEC) - ino |= S_IXUGO; - i->i_mode = ino; - break; - case 3: - i->i_op = &page_symlink_inode_operations; - i->i_data.a_ops = &romfs_aops; - i->i_mode = ino | S_IRWXUGO; - break; - default: - /* depending on MBZ for sock/fifos */ - nextfh = be32_to_cpu(ri.spec); - init_special_inode(i, ino, - MKDEV(nextfh>>16,nextfh&0xffff)); - } - unlock_new_inode(i); - return i; -} - -static struct kmem_cache * romfs_inode_cachep; - -static struct inode *romfs_alloc_inode(struct super_block *sb) -{ - struct romfs_inode_info *ei; - ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void romfs_destroy_inode(struct inode *inode) -{ - kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); -} - -static void init_once(void *foo) -{ - struct romfs_inode_info *ei = foo; - - inode_init_once(&ei->vfs_inode); -} - -static int init_inodecache(void) -{ - romfs_inode_cachep = 
kmem_cache_create("romfs_inode_cache", - sizeof(struct romfs_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (romfs_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(romfs_inode_cachep); -} - -static int romfs_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_RDONLY; - return 0; -} - -static const struct super_operations romfs_ops = { - .alloc_inode = romfs_alloc_inode, - .destroy_inode = romfs_destroy_inode, - .statfs = romfs_statfs, - .remount_fs = romfs_remount, -}; - -static int romfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super, - mnt); -} - -static struct file_system_type romfs_fs_type = { - .owner = THIS_MODULE, - .name = "romfs", - .get_sb = romfs_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - -static int __init init_romfs_fs(void) -{ - int err = init_inodecache(); - if (err) - goto out1; - err = register_filesystem(&romfs_fs_type); - if (err) - goto out; - return 0; -out: - destroy_inodecache(); -out1: - return err; -} - -static void __exit exit_romfs_fs(void) -{ - unregister_filesystem(&romfs_fs_type); - destroy_inodecache(); -} - -/* Yes, works even as a module... :) */ - -module_init(init_romfs_fs) -module_exit(exit_romfs_fs) -MODULE_LICENSE("GPL"); diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h new file mode 100644 index 000000000000..95217b830118 --- /dev/null +++ b/fs/romfs/internal.h @@ -0,0 +1,47 @@ +/* RomFS internal definitions + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/romfs_fs.h> + +struct romfs_inode_info { + struct inode vfs_inode; + unsigned long i_metasize; /* size of non-data area */ + unsigned long i_dataoffset; /* from the start of fs */ +}; + +static inline size_t romfs_maxsize(struct super_block *sb) +{ + return (size_t) (unsigned long) sb->s_fs_info; +} + +static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) +{ + return container_of(inode, struct romfs_inode_info, vfs_inode); +} + +/* + * mmap-nommu.c + */ +#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) +extern const struct file_operations romfs_ro_fops; +#else +#define romfs_ro_fops generic_ro_fops +#endif + +/* + * storage.c + */ +extern int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen); +extern ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen); +extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c new file mode 100644 index 000000000000..f0511e816967 --- /dev/null +++ b/fs/romfs/mmap-nommu.c @@ -0,0 +1,75 @@ +/* NOMMU mmap support for RomFS on MTD devices + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 
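
internal.h above embeds the VFS inode inside struct romfs_inode_info and recovers the container with container_of(), the usual kernel embedding pattern behind helpers like ROMFS_I(). A self-contained sketch with a minimal container_of and illustrative stub types:

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inode_stub { int i_mode; };	/* stand-in for struct inode */

struct romfs_inode_stub {
	struct inode_stub vfs_inode;	/* embedded base object */
	unsigned long i_metasize;
	unsigned long i_dataoffset;
};

/* Map a pointer to the embedded member back to its container, which
 * is exactly what ROMFS_I() does with the real types. */
static struct romfs_inode_stub *ROMFS_I_stub(struct inode_stub *inode)
{
	return container_of(inode, struct romfs_inode_stub, vfs_inode);
}
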
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/mm.h> +#include <linux/mtd/super.h> +#include "internal.h" + +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't, and doesn't, copy + * private mappings) + * - attempts to map through to the underlying MTD device + */ +static unsigned long romfs_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct inode *inode = file->f_mapping->host; + struct mtd_info *mtd = inode->i_sb->s_mtd; + unsigned long isize, offset; + + if (!mtd) + goto cant_map_directly; + + isize = i_size_read(inode); + offset = pgoff << PAGE_SHIFT; + if (offset > isize || len > isize || offset > isize - len) + return (unsigned long) -EINVAL; + + /* we need to call down to the MTD layer to do the actual mapping */ + if (mtd->get_unmapped_area) { + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + +cant_map_directly: + return (unsigned long) -ENOSYS; +} + +/* + * permit a R/O mapping to be made directly through onto an MTD device if + * possible + */ +static int romfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; +} + +const struct file_operations romfs_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .mmap = romfs_mmap, + .get_unmapped_area = romfs_get_unmapped_area, +}; diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c new file mode 100644 index 000000000000..b3208adf8e71 --- /dev/null +++ b/fs/romfs/storage.c @@ -0,0 +1,293 @@ +/* RomFS storage access routines + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/fs.h> +#include <linux/mtd/super.h> +#include <linux/buffer_head.h> +#include "internal.h" + +#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK) +#error no ROMFS backing store interface configured +#endif + +#ifdef CONFIG_ROMFS_ON_MTD +#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__)) + +/* + * read data from a romfs image on an MTD device + */ +static int romfs_mtd_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t rlen; + int ret; + + ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf); + return (ret < 0 || rlen != buflen) ?
-EIO : 0; +} + +/* + * determine the length of a string in a romfs image on an MTD device + */ +static ssize_t romfs_mtd_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + ssize_t n = 0; + size_t segment; + u_char buf[16], *p; + size_t len; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (maxlen > 0) { + segment = min_t(size_t, maxlen, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + p = memchr(buf, 0, len); + if (p) + return n + (p - buf); + maxlen -= len; + pos += len; + n += len; + } + + return n; +} + +/* + * compare a string to one in a romfs image on MTD + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + u_char buf[17]; + size_t len, segment; + int ret; + + /* scan the string up to 16 bytes at a time, and attempt to grab the + * trailing NUL whilst we're at it */ + buf[0] = 0xff; + + while (size > 0) { + segment = min_t(size_t, size + 1, 17); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + len--; + if (memcmp(buf, str, len) != 0) + return 0; + buf[0] = buf[len]; + size -= len; + pos += len; + str += len; + } + + /* check that the trailing NUL was present */ + if (buf[0]) + return 0; + + return 1; +} +#endif /* CONFIG_ROMFS_ON_MTD */ + +#ifdef CONFIG_ROMFS_ON_BLOCK +/* + * read data from a romfs image on a block device + */ +static int romfs_blk_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + + /* copy the data up to blocksize bytes at a time */ + while (buflen > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, buflen, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + memcpy(buf, bh->b_data + offset, segment); + brelse(bh); + buf += segment; + buflen -= segment; + pos += segment; + } + + return 0; +} + +/* + * determine the length of a string in romfs on a block device + */ +static ssize_t romfs_blk_strnlen(struct super_block *sb, + unsigned long pos, size_t limit) +{ + struct buffer_head *bh; + unsigned long offset; + ssize_t n = 0; + size_t segment; + u_char *buf, *p; + + /* scan the string up to blocksize bytes at a time */ + while (limit > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, limit, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + buf = bh->b_data + offset; + p = memchr(buf, 0, segment); + brelse(bh); + if (p) + return n + (p - buf); + limit -= segment; + pos += segment; + n += segment; + } + + return n; +} + +/* + * compare a string to one in a romfs image on a block device + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + bool matched, terminated = false; + + /* compare string up to a block at a time */ + while (size > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, size, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = (memcmp(bh->b_data + offset, str, segment) == 0); + + size -= segment; + pos += segment; + str += segment; + if (matched && size == 0 && offset + segment < ROMBSIZE) { + if (!bh->b_data[offset + segment]) + terminated = true; + else + matched = false; + } + brelse(bh); + if
(!matched) + return 0; + } + + if (!terminated) { + /* the terminating NUL must be on the first byte of the next + * block */ + BUG_ON((pos & (ROMBSIZE - 1)) != 0); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = !bh->b_data[0]; + brelse(bh); + if (!matched) + return 0; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_BLOCK */ + +/* + * read data from the romfs image + */ +int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (buflen > limit - pos) + buflen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_read(sb, pos, buf, buflen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_read(sb, pos, buf, buflen); +#endif + return -EIO; +} + +/* + * determine the length of a string in romfs + */ +ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (maxlen > limit - pos) + maxlen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strnlen(sb, pos, maxlen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strnlen(sb, pos, maxlen); +#endif + return -EIO; +} + +/* + * compare a string to one in romfs + * - the string to be compared to, str, may not be NUL-terminated; instead the + * string is of the specified size + * - return 1 if matched, 0 if differ, -ve if error + */ +int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (size > ROMFS_MAXFN) + return -ENAMETOOLONG; + if (size + 1 > limit - pos) + return -EIO; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strcmp(sb, pos, str, size); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strcmp(sb, pos, str, size); +#endif + return -EIO; +} diff --git a/fs/romfs/super.c b/fs/romfs/super.c new file mode 100644 index 000000000000..4ab3c03d8f95 --- /dev/null +++ b/fs/romfs/super.c @@ -0,0 +1,654 @@ +/* Block- or MTD-based romfs + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
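
Each romfs_dev_* wrapper above checks pos against the image size before computing limit - pos, so the unsigned subtraction can never wrap, and then clamps (or, for the compare, rejects) an over-long request. The guard in isolation, with illustrative names and size_t standing in for the kernel's mixed integer types:

#include <stddef.h>

/* Reject a wholly out-of-range span, otherwise clamp *len so that
 * pos + *len stays inside [0, limit); mirrors the checks at the top
 * of romfs_dev_read() above. */
static int clamp_span(size_t limit, size_t pos, size_t *len)
{
	if (pos >= limit)
		return -1;		/* -EIO in the kernel */
	if (*len > limit - pos)		/* safe: pos < limit holds here */
		*len = limit - pos;
	return 0;
}
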
+ * Written by David Howells (dhowells@redhat.com) + * + * Derived from: ROMFS file system, Linux implementation + * + * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu> + * + * Using parts of the minix filesystem + * Copyright © 1991, 1992 Linus Torvalds + * + * and parts of the affs filesystem additionally + * Copyright © 1993 Ray Burr + * Copyright © 1996 Hans-Joachim Widmaier + * + * Changes + * Changed for 2.1.19 modules + * Jan 1997 Initial release + * Jun 1997 2.1.43+ changes + * Proper page locking in readpage + * Changed to work with 2.1.45+ fs + * Jul 1997 Fixed follow_link + * 2.1.47 + * lookup shouldn't return -ENOENT + * from Horst von Brand: + * fail on wrong checksum + * double unlock_super was possible + * correct namelen for statfs + * spotted by Bill Hawes: + * readlink shouldn't iput() + * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() + * exposed a problem in readdir + * 2.1.107 code-freeze spellchecker run + * Aug 1998 2.1.118+ VFS changes + * Sep 1998 2.1.122 another VFS change (follow_link) + * Apr 1999 2.2.7 no more EBADF checking in + * lookup/readdir, use ERR_PTR + * Jun 1999 2.3.6 d_alloc_root use changed + * 2.3.9 clean up usage of ENOENT/negative + * dentries in lookup + * clean up page flags setting + * (error, uptodate, locking) in + * in readpage + * use init_special_inode for + * fifos/sockets (and streamline) in + * read_inode, fix _ops table order + * Aug 1999 2.3.16 __initfunc() => __init change + * Oct 1999 2.3.24 page->owner hack obsoleted + * Nov 1999 2.3.27 2.3.25+ page->offset => index change + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/parser.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/statfs.h> +#include <linux/mtd/super.h> +#include <linux/ctype.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uaccess.h> +#include "internal.h" + +static struct kmem_cache *romfs_inode_cachep; + +static const umode_t romfs_modemap[8] = { + 0, /* hard link */ + S_IFDIR | 0644, /* directory */ + S_IFREG | 0644, /* regular file */ + S_IFLNK | 0777, /* symlink */ + S_IFBLK | 0600, /* blockdev */ + S_IFCHR | 0600, /* chardev */ + S_IFSOCK | 0644, /* socket */ + S_IFIFO | 0644 /* FIFO */ +}; + +static const unsigned char romfs_dtype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO +}; + +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); + +/* + * read a page worth of data from the image + */ +static int romfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t offset, size; + unsigned long fillsize, pos; + void *buf; + int ret; + + buf = kmap(page); + if (!buf) + return -ENOMEM; + + /* 32 bit warning -- but not for us :) */ + offset = page_offset(page); + size = i_size_read(inode); + fillsize = 0; + ret = 0; + if (offset < size) { + size -= offset; + fillsize = size > PAGE_SIZE ? 
PAGE_SIZE : size; + + pos = ROMFS_I(inode)->i_dataoffset + offset; + + ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); + if (ret < 0) { + SetPageError(page); + fillsize = 0; + ret = -EIO; + } + } + + if (fillsize < PAGE_SIZE) + memset(buf + fillsize, 0, PAGE_SIZE - fillsize); + if (ret == 0) + SetPageUptodate(page); + + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + return ret; +} + +static const struct address_space_operations romfs_aops = { + .readpage = romfs_readpage +}; + +/* + * read the entries from a directory + */ +static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *i = filp->f_dentry->d_inode; + struct romfs_inode ri; + unsigned long offset, maxoff; + int j, ino, nextfh; + int stored = 0; + char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ + int ret; + + maxoff = romfs_maxsize(i->i_sb); + + offset = filp->f_pos; + if (!offset) { + offset = i->i_ino & ROMFH_MASK; + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* Not really failsafe, but we are read-only... */ + for (;;) { + if (!offset || offset >= maxoff) { + offset = maxoff; + filp->f_pos = offset; + goto out; + } + filp->f_pos = offset; + + /* Fetch inode info */ + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + + j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, + sizeof(fsname) - 1); + if (j < 0) + goto out; + + ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); + if (ret < 0) + goto out; + fsname[j] = '\0'; + + ino = offset; + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) + ino = be32_to_cpu(ri.spec); + if (filldir(dirent, fsname, j, offset, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) + goto out; + + stored++; + offset = nextfh & ROMFH_MASK; + } + +out: + return stored; +} + +/* + * look up an entry in a directory + */ +static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + unsigned long offset, maxoff; + struct inode *inode; + struct romfs_inode ri; + const char *name; /* got from dentry */ + int len, ret; + + offset = dir->i_ino & ROMFH_MASK; + ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto error; + + /* search all the file entries in the list starting from the one + * pointed to by the directory's special data */ + maxoff = romfs_maxsize(dir->i_sb); + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + name = dentry->d_name.name; + len = dentry->d_name.len; + + for (;;) { + if (!offset || offset >= maxoff) + goto out0; + + ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* try to match the first 16 bytes of name */ + ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); + if (ret < 0) + goto error; + if (ret == 1) + break; + + /* next entry */ + offset = be32_to_cpu(ri.next) & ROMFH_MASK; + } + + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + inode = romfs_iget(dir->i_sb, offset); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + goto outi; + + /* + * it's a bit funky, _lookup needs to return an error code + * (negative) or a NULL, both as a dentry. ENOENT should not + * be returned, instead we need to create a negative dentry by + * d_add(dentry, NULL); and return 0 as no error. + * (Although as I see, it only matters on writable file + * systems). 
+ */ +out0: + inode = NULL; +outi: + d_add(dentry, inode); + ret = 0; +error: + return ERR_PTR(ret); +} + +static const struct file_operations romfs_dir_operations = { + .read = generic_read_dir, + .readdir = romfs_readdir, +}; + +static struct inode_operations romfs_dir_inode_operations = { + .lookup = romfs_lookup, +}; + +/* + * get a romfs inode based on its position in the image (which doubles as the + * inode number) + */ +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) +{ + struct romfs_inode_info *inode; + struct romfs_inode ri; + struct inode *i; + unsigned long nlen; + unsigned nextfh; + int ret; + umode_t mode; + + /* we might have to traverse a chain of "hard link" file entries to get + * to the actual file */ + for (;;) { + ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* XXX: do romfs_checksum here too (with name) */ + + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) + break; + + pos = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* determine the length of the filename */ + nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); + if (IS_ERR_VALUE(nlen)) + goto eio; + + /* get an inode for this image position */ + i = iget_locked(sb, pos); + if (!i) + return ERR_PTR(-ENOMEM); + + if (!(i->i_state & I_NEW)) + return i; + + /* precalculate the data offset */ + inode = ROMFS_I(i); + inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; + inode->i_dataoffset = pos + inode->i_metasize; + + i->i_nlink = 1; /* Hard to decide.. */ + i->i_size = be32_to_cpu(ri.size); + i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; + i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + + /* set up mode and ops */ + mode = romfs_modemap[nextfh & ROMFH_TYPE]; + + switch (nextfh & ROMFH_TYPE) { + case ROMFH_DIR: + i->i_size = ROMFS_I(i)->i_metasize; + i->i_op = &romfs_dir_inode_operations; + i->i_fop = &romfs_dir_operations; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_REG: + i->i_fop = &romfs_ro_fops; + i->i_data.a_ops = &romfs_aops; + if (i->i_sb->s_mtd) + i->i_data.backing_dev_info = + i->i_sb->s_mtd->backing_dev_info; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_SYM: + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &romfs_aops; + mode |= S_IRWXUGO; + break; + default: + /* depending on MBZ for sock/fifos */ + nextfh = be32_to_cpu(ri.spec); + init_special_inode(i, mode, MKDEV(nextfh >> 16, + nextfh & 0xffff)); + break; + } + + i->i_mode = mode; + + unlock_new_inode(i); + return i; + +eio: + ret = -EIO; +error: + printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + return ERR_PTR(ret); +} + +/* + * allocate a new inode + */ +static struct inode *romfs_alloc_inode(struct super_block *sb) +{ + struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + return inode ? 
&inode->vfs_inode : NULL; +} + +/* + * return a spent inode to the slab cache + */ +static void romfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); +} + +/* + * get filesystem statistics + */ +static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev ? sb->s_bdev->bd_dev : sb->s_dev); /* MTD-backed mounts have no blockdev */ + + buf->f_type = ROMFS_MAGIC; + buf->f_namelen = ROMFS_MAXFN; + buf->f_bsize = ROMBSIZE; + buf->f_bfree = buf->f_bavail = buf->f_ffree = 0; /* read-only: nothing free */ + buf->f_blocks = + (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * remounting must involve read-only + */ +static int romfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations romfs_super_ops = { + .alloc_inode = romfs_alloc_inode, + .destroy_inode = romfs_destroy_inode, + .statfs = romfs_statfs, + .remount_fs = romfs_remount, +}; + +/* + * checksum check on part of a romfs filesystem + */ +static __u32 romfs_checksum(const void *data, int size) +{ + const __be32 *ptr = data; + __u32 sum; + + sum = 0; + size >>= 2; + while (size > 0) { + sum += be32_to_cpu(*ptr++); + size--; + } + return sum; +} + +/* + * fill in the superblock + */ +static int romfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct romfs_super_block *rsb; + struct inode *root; + unsigned long pos, img_size; + const char *storage; + size_t len; + int ret; + +#ifdef CONFIG_BLOCK + if (!sb->s_mtd) { + sb_set_blocksize(sb, ROMBSIZE); + } else { + sb->s_blocksize = ROMBSIZE; + sb->s_blocksize_bits = blksize_bits(ROMBSIZE); + } +#endif + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_magic = ROMFS_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_op = &romfs_super_ops; + + /* read the image superblock and check it */ + rsb = kmalloc(512, GFP_KERNEL); + if (!rsb) + return -ENOMEM; + + sb->s_fs_info = (void *) 512; + ret = romfs_dev_read(sb, 0, rsb, 512); + if (ret < 0) + goto error_rsb; + + img_size = be32_to_cpu(rsb->size); + + if (sb->s_mtd && img_size > sb->s_mtd->size) + goto error_rsb_inval; + + sb->s_fs_info = (void *) img_size; + + if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || + img_size < ROMFH_SIZE) { + if (!silent) + printk(KERN_WARNING "VFS:" + " Can't find a romfs filesystem on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { + printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + storage = sb->s_mtd ?
"MTD" : "the block layer"; + + len = strnlen(rsb->name, ROMFS_MAXFN); + if (!silent) + printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); + + kfree(rsb); + rsb = NULL; + + /* find the root directory */ + pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; + + root = romfs_iget(sb, pos); + if (!root) + goto error; + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto error_i; + + return 0; + +error_i: + iput(root); +error: + return -EINVAL; +error_rsb_inval: + ret = -EINVAL; +error_rsb: + return ret; +} + +/* + * get a superblock for mounting + */ +static int romfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + int ret = -EINVAL; + +#ifdef CONFIG_ROMFS_ON_MTD + ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super, + mnt); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (ret == -EINVAL) + ret = get_sb_bdev(fs_type, flags, dev_name, data, + romfs_fill_super, mnt); +#endif + return ret; +} + +/* + * destroy a romfs superblock in the appropriate manner + */ +static void romfs_kill_sb(struct super_block *sb) +{ +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) { + kill_mtd_super(sb); + return; + } +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) { + kill_block_super(sb); + return; + } +#endif +} + +static struct file_system_type romfs_fs_type = { + .owner = THIS_MODULE, + .name = "romfs", + .get_sb = romfs_get_sb, + .kill_sb = romfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * inode storage initialiser + */ +static void romfs_i_init_once(void *_inode) +{ + struct romfs_inode_info *inode = _inode; + + inode_init_once(&inode->vfs_inode); +} + +/* + * romfs module initialisation + */ +static int __init init_romfs_fs(void) +{ + int ret; + + printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + + romfs_inode_cachep = + kmem_cache_create("romfs_i", + sizeof(struct romfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + romfs_i_init_once); + + if (!romfs_inode_cachep) { + printk(KERN_ERR + "ROMFS error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = register_filesystem(&romfs_fs_type); + if (ret) { + printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + goto error_register; + } + return 0; + +error_register: + kmem_cache_destroy(romfs_inode_cachep); + return ret; +} + +/* + * romfs module removal + */ +static void __exit exit_romfs_fs(void) +{ + unregister_filesystem(&romfs_fs_type); + kmem_cache_destroy(romfs_inode_cachep); +} + +module_init(init_romfs_fs); +module_exit(exit_romfs_fs); + +MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ diff --git a/fs/splice.c b/fs/splice.c index dd727d43e5b7..666953d59a35 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -182,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, do_wakeup = 0; page_nr = 0; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); for (;;) { if (!pipe->readers) { @@ -245,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, pipe->waiting_writers--; } - if (pipe->inode) { - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + 
wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (page_nr < spd_pages) @@ -555,8 +552,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create * a new page in the output file page cache and fill/dirty that. */ -static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) +int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { struct file *file = sd->u.file; struct address_space *mapping = file->f_mapping; @@ -600,108 +597,177 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, out: return ret; } +EXPORT_SYMBOL(pipe_to_file); + +static void wakeup_pipe_writers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +} /** - * __splice_from_pipe - splice data from a pipe to given actor + * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: - * This function does little more than loop over the pipe and call - * @actor to do the actual moving of a single struct pipe_buffer to - * the desired destination. See pipe_to_file, pipe_to_sendpage, or - * pipe_to_user. + * This function loops over the pipe and calls @actor to do the + * actual moving of a single struct pipe_buffer to the desired + * destination. It returns when there are no more buffers left in + * the pipe or if the requested number of bytes (@sd->total_len) + * have been copied. It returns a positive number (one) if the + * pipe needs to be filled with more data, zero if the required + * number of bytes have been copied and -errno on error. * + * This, together with splice_from_pipe_{begin,end,next}, may be + * used to implement the functionality of __splice_from_pipe() when + * locking is required around copying the pipe buffers to the + * destination.
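+ * + * For example, a caller that must take its own lock around the actual + * copying could use these helpers as follows (a sketch: "actor" and + * "some_lock" are placeholders; generic_file_splice_write() below is a + * real user of this pattern): + * + * splice_from_pipe_begin(sd); + * do { + * ret = splice_from_pipe_next(pipe, sd); + * if (ret <= 0) + * break; + * mutex_lock(&some_lock); + * ret = splice_from_pipe_feed(pipe, sd, actor); + * mutex_unlock(&some_lock); + * } while (ret > 0); + * splice_from_pipe_end(pipe, sd);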
*/ -ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, - splice_actor *actor) +int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) { - int ret, do_wakeup, err; - - ret = 0; - do_wakeup = 0; - - for (;;) { - if (pipe->nrbufs) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - const struct pipe_buf_operations *ops = buf->ops; + int ret; - sd->len = buf->len; - if (sd->len > sd->total_len) - sd->len = sd->total_len; + while (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + const struct pipe_buf_operations *ops = buf->ops; - err = actor(pipe, buf, sd); - if (err <= 0) { - if (!ret && err != -ENODATA) - ret = err; + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; - break; - } + ret = actor(pipe, buf, sd); + if (ret <= 0) { + if (ret == -ENODATA) + ret = 0; + return ret; + } + buf->offset += ret; + buf->len -= ret; - ret += err; - buf->offset += err; - buf->len -= err; + sd->num_spliced += ret; + sd->len -= ret; + sd->pos += ret; + sd->total_len -= ret; - sd->len -= err; - sd->pos += err; - sd->total_len -= err; - if (sd->len) - continue; + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + sd->need_wakeup = true; + } - if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->nrbufs--; - if (pipe->inode) - do_wakeup = 1; - } + if (!sd->total_len) + return 0; + } - if (!sd->total_len) - break; - } + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_feed); - if (pipe->nrbufs) - continue; +/** + * splice_from_pipe_next - wait for some data to splice from + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wait for some data and return a positive + * value (one) if pipe buffers are available. It will return zero + * or -errno if no more data needs to be spliced. + */ +int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + while (!pipe->nrbufs) { if (!pipe->writers) - break; - if (!pipe->waiting_writers) { - if (ret) - break; - } + return 0; - if (sd->flags & SPLICE_F_NONBLOCK) { - if (!ret) - ret = -EAGAIN; - break; - } + if (!pipe->waiting_writers && sd->num_spliced) + return 0; - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + if (sd->flags & SPLICE_F_NONBLOCK) + return -EAGAIN; - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - do_wakeup = 0; + if (signal_pending(current)) + return -ERESTARTSYS; + + if (sd->need_wakeup) { + wakeup_pipe_writers(pipe); + sd->need_wakeup = false; } pipe_wait(pipe); } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_next); - return ret; +/** + * splice_from_pipe_begin - start splicing from pipe + * @sd: information about the splice operation + * + * Description: + * This function should be called before a loop containing + * splice_from_pipe_next() and splice_from_pipe_feed() to + * initialize the necessary fields of @sd. 
+ */ +void splice_from_pipe_begin(struct splice_desc *sd) +{ + sd->num_spliced = 0; + sd->need_wakeup = false; +} +EXPORT_SYMBOL(splice_from_pipe_begin); + +/** + * splice_from_pipe_end - finish splicing from pipe + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wake up pipe writers if necessary. It should + * be called after a loop containing splice_from_pipe_next() and + * splice_from_pipe_feed(). + */ +void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + if (sd->need_wakeup) + wakeup_pipe_writers(pipe); +} +EXPORT_SYMBOL(splice_from_pipe_end); + +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * + */ +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + splice_from_pipe_begin(sd); + do { + ret = splice_from_pipe_next(pipe, sd); + if (ret > 0) + ret = splice_from_pipe_feed(pipe, sd, actor); + } while (ret > 0); + splice_from_pipe_end(pipe, sd); + + return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); @@ -715,7 +781,7 @@ EXPORT_SYMBOL(__splice_from_pipe); * @actor: handler that splices the data * * Description: - * See __splice_from_pipe. This function locks the input and output inodes, + * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). * */ @@ -724,7 +790,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, splice_actor *actor) { ssize_t ret; - struct inode *inode = out->f_mapping->host; struct splice_desc sd = { .total_len = len, .flags = flags, @@ -732,21 +797,15 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; - /* - * The actor worker might be calling ->write_begin and - * ->write_end. Most of the time, these expect i_mutex to - * be held. Since this may result in an ABBA deadlock with - * pipe->inode, we have to order lock acquiry here. - */ - inode_double_lock(inode, pipe->inode); + pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); - inode_double_unlock(inode, pipe->inode); + pipe_unlock(pipe); return ret; } /** - * generic_file_splice_write_nolock - generic_file_splice_write without mutexes + * generic_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out @@ -755,13 +814,12 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * * Description: * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. The caller is responsible - * for acquiring i_mutex on both inodes. + * the given pipe inode to the given file. 
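+ * + * For reference, this is the write-side path exercised when user space + * splices from a pipe into a regular file, e.g. (illustrative userspace + * snippet; out_fd is assumed to be a regular file opened for writing): + * + * int pfd[2]; + * loff_t off = 0; + * + * pipe(pfd); + * write(pfd[1], "hello", 5); + * splice(pfd[0], NULL, out_fd, &off, 5, SPLICE_F_MOVE);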
* */ ssize_t -generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; @@ -772,70 +830,28 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; ssize_t ret; - int err; - - err = file_remove_suid(out); - if (unlikely(err)) - return err; - - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - if (ret > 0) { - unsigned long nr_pages; - *ppos += ret; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pipe_lock(pipe); - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - - if (err) - ret = err; - } - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); - } + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; - return ret; -} + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = file_remove_suid(out); + if (!ret) + ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); -EXPORT_SYMBOL(generic_file_splice_write_nolock); + pipe_unlock(pipe); -/** - * generic_file_splice_write - splice data from a pipe to a file - * @pipe: pipe info - * @out: file to write to - * @ppos: position in @out - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. 
- * - */ -ssize_t -generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - struct splice_desc sd = { - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - }; - ssize_t ret; + if (sd.num_spliced) + ret = sd.num_spliced; - inode_double_lock(inode, pipe->inode); - ret = file_remove_suid(out); - if (likely(!ret)) - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - inode_double_unlock(inode, pipe->inode); if (ret > 0) { unsigned long nr_pages; @@ -1324,8 +1340,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, if (!pipe) return -EBADF; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); error = ret = 0; while (nr_segs) { @@ -1380,8 +1395,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, iov++; } - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (!ret) ret = error; @@ -1509,7 +1523,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (!pipe->nrbufs) { if (signal_pending(current)) { @@ -1527,7 +1541,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe_wait(pipe); } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1547,7 +1561,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (pipe->nrbufs >= PIPE_BUFFERS) { if (!pipe->readers) { @@ -1568,7 +1582,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe->waiting_writers--; } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1584,10 +1598,10 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * Potential ABBA deadlock, work around it by ordering lock - * grabbing by inode address. Otherwise two different processes + * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - inode_double_lock(ipipe->inode, opipe->inode); + pipe_double_lock(ipipe, opipe); do { if (!opipe->readers) { @@ -1638,7 +1652,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) ret = -EAGAIN; - inode_double_unlock(ipipe->inode, opipe->inode); + pipe_unlock(ipipe); + pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 8258cf9a0317..70e3244fa30f 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,4 +5,3 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o -#squashfs-y += squashfs2_0.o diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 1c4739e33af6..40c98fa6b5d6 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -252,6 +252,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, cache->entries = entries; cache->block_size = block_size; cache->pages = block_size >> PAGE_CACHE_SHIFT; + cache->pages = cache->pages ? 
cache->pages : 1; cache->name = name; cache->num_waiters = 0; spin_lock_init(&cache->lock); diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 69e971d5ddc1..2b1b8fe5e037 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -40,6 +40,7 @@ #include <linux/dcache.h> #include <linux/exportfs.h> #include <linux/zlib.h> +#include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index ffa6edcd2d0c..0adc624c956f 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -157,6 +157,16 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) goto failed_mount; + /* + * Check the system page size is not larger than the filesystem + * block size (by default 128K). This is currently not supported. + */ + if (PAGE_CACHE_SIZE > msblk->block_size) { + ERROR("Page size > filesystem block size (%d). This is " + "currently not supported!\n", msblk->block_size); + goto failed_mount; + } + msblk->block_log = le16_to_cpu(sblk->block_log); if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) goto failed_mount; diff --git a/fs/stat.c b/fs/stat.c index 2db740a0cfb5..075694e31d8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -55,59 +55,54 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) EXPORT_SYMBOL(vfs_getattr); -int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) +int vfs_fstat(unsigned int fd, struct kstat *stat) { - struct path path; - int error; + struct file *f = fget(fd); + int error = -EBADF; - error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); - if (!error) { - error = vfs_getattr(path.mnt, path.dentry, stat); - path_put(&path); + if (f) { + error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); + fput(f); } return error; } +EXPORT_SYMBOL(vfs_fstat); -int vfs_stat(char __user *name, struct kstat *stat) +int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag) { - return vfs_stat_fd(AT_FDCWD, name, stat); -} + struct path path; + int error = -EINVAL; + int lookup_flags = 0; -EXPORT_SYMBOL(vfs_stat); + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; -int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) -{ - struct path path; - int error; + if (!(flag & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; - error = user_path_at(dfd, name, 0, &path); - if (!error) { - error = vfs_getattr(path.mnt, path.dentry, stat); - path_put(&path); - } + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); +out: return error; } +EXPORT_SYMBOL(vfs_fstatat); -int vfs_lstat(char __user *name, struct kstat *stat) +int vfs_stat(char __user *name, struct kstat *stat) { - return vfs_lstat_fd(AT_FDCWD, name, stat); + return vfs_fstatat(AT_FDCWD, name, stat, 0); } +EXPORT_SYMBOL(vfs_stat); -EXPORT_SYMBOL(vfs_lstat); - -int vfs_fstat(unsigned int fd, struct kstat *stat) +int vfs_lstat(char __user *name, struct kstat *stat) { - struct file *f = fget(fd); - int error = -EBADF; - - if (f) { - error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); - fput(f); - } - return error; + return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } +EXPORT_SYMBOL(vfs_lstat); -EXPORT_SYMBOL(vfs_fstat); #ifdef __ARCH_WANT_OLD_STAT @@ -155,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta SYSCALL_DEFINE2(stat, char __user *, filename, 
struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_stat(filename, &stat); + if (error) + return error; - return error; + return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_lstat(filename, &stat); + if (error) + return error; - return error; + return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) @@ -240,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); - - if (!error) - error = cp_new_stat(&stat, statbuf); + int error = vfs_stat(filename, &stat); - return error; + if (error) + return error; + return cp_new_stat(&stat, statbuf); } SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_new_stat(&stat, statbuf); + error = vfs_lstat(filename, &stat); + if (error) + return error; - return error; + return cp_new_stat(&stat, statbuf); } #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) @@ -264,21 +261,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename, struct stat __user *, statbuf, int, flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_new_stat(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat(&stat, statbuf); } #endif @@ -404,21 +392,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, struct stat64 __user *, statbuf, int, flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_new_stat64(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat64(&stat, statbuf); } #endif /* __ARCH_WANT_STAT64 */ diff --git a/fs/super.c b/fs/super.c index 77cb4ec919b9..1943fdf655fa 100644 --- a/fs/super.c +++ b/fs/super.c @@ -208,6 +208,34 @@ void deactivate_super(struct super_block *s) EXPORT_SYMBOL(deactivate_super); /** + * deactivate_locked_super - drop an active reference to superblock + * @s: superblock to deactivate + * + * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that + * it does not unlock it until it's all over. As a result, it's safe to + * use it to dispose of a new superblock on ->get_sb() failure exits - nobody + * will see the sucker until it's all over. The equivalent using up_write + + * deactivate_super is safe for that purpose only if the superblock is either + * safe to use or has a NULL ->s_root when we unlock.
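+ * + * For example, a typical ->get_sb() failure exit collapses to the + * following (sketch; get_sb_ns() and get_sb_bdev() below are real + * callers): + * + * err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + * if (err) { + * deactivate_locked_super(sb); + * return err; + * } + * sb->s_flags |= MS_ACTIVE;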
+ */ +void deactivate_locked_super(struct super_block *s) +{ + struct file_system_type *fs = s->s_type; + if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { + s->s_count -= S_BIAS-1; + spin_unlock(&sb_lock); + vfs_dq_off(s, 0); + fs->kill_sb(s); + put_filesystem(fs); + put_super(s); + } else { + up_write(&s->s_umount); + } +} + +EXPORT_SYMBOL(deactivate_locked_super); + +/** * grab_super - acquire an active reference * @s: reference we are trying to make active * @@ -771,6 +799,45 @@ void kill_litter_super(struct super_block *sb) EXPORT_SYMBOL(kill_litter_super); +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, data); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err; + sb->s_flags = flags; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= MS_ACTIVE; + } + + simple_set_mnt(mnt, sb); + return 0; +} + +EXPORT_SYMBOL(get_sb_ns); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { @@ -814,8 +881,7 @@ int get_sb_bdev(struct file_system_type *fs_type, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); error = -EBUSY; goto error_bdev; } @@ -830,8 +896,7 @@ int get_sb_bdev(struct file_system_type *fs_type, sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); goto error; } @@ -857,7 +922,7 @@ void kill_block_super(struct super_block *sb) struct block_device *bdev = sb->s_bdev; fmode_t mode = sb->s_mode; - bdev->bd_super = 0; + bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_exclusive(bdev, mode); @@ -881,8 +946,7 @@ int get_sb_nodev(struct file_system_type *fs_type, error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); return error; } s->s_flags |= MS_ACTIVE; @@ -912,8 +976,7 @@ int get_sb_single(struct file_system_type *fs_type, s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 
1 : 0); if (error) { - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); return error; } s->s_flags |= MS_ACTIVE; @@ -966,8 +1029,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void return mnt; out_sb: dput(mnt->mnt_root); - up_write(&mnt->mnt_sb->s_umount); - deactivate_super(mnt->mnt_sb); + deactivate_locked_super(mnt->mnt_sb); out_free_secdata: free_secdata(secdata); out_mnt: diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 93e0c0281d45..9345806c8853 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf, count = size - offs; } - temp = kmalloc(count, GFP_KERNEL); - if (!temp) - return -ENOMEM; - - if (copy_from_user(temp, userbuf, count)) { - count = -EFAULT; - goto out_free; - } + temp = memdup_user(userbuf, count); + if (IS_ERR(temp)) + return PTR_ERR(temp); mutex_lock(&bb->mutex); @@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf, if (count > 0) *off = offs + count; -out_free: - kfree(temp); return count; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 289c43a47263..b1606e07b7a3 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) if (buffer->event != atomic_read(&od->event)) goto trigger; - return 0; + return DEFAULT_POLLMASK; trigger: buffer->needs_read_fill = 1; - return POLLERR|POLLPRI; + return DEFAULT_POLLMASK|POLLERR|POLLPRI; } void sysfs_notify_dirent(struct sysfs_dirent *sd) @@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct { struct work_struct work; }; +static struct workqueue_struct *sysfs_workqueue; static DEFINE_MUTEX(sysfs_workq_mutex); static LIST_HEAD(sysfs_workq); static void sysfs_schedule_callback_work(struct work_struct *work) @@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_lock(&sysfs_workq_mutex); list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) if (ss->kobj == kobj) { + module_put(owner); mutex_unlock(&sysfs_workq_mutex); return -EAGAIN; } mutex_unlock(&sysfs_workq_mutex); + if (sysfs_workqueue == NULL) { + sysfs_workqueue = create_workqueue("sysfsd"); + if (sysfs_workqueue == NULL) { + module_put(owner); + return -ENOMEM; + } + } + ss = kmalloc(sizeof(*ss), GFP_KERNEL); if (!ss) { module_put(owner); @@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_lock(&sysfs_workq_mutex); list_add_tail(&ss->workq_list, &sysfs_workq); mutex_unlock(&sysfs_workq_mutex); - schedule_work(&ss->work); + queue_work(sysfs_workqueue, &ss->work); return 0; } EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index f393620890ee..af1914462f02 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c) } /** - * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. * @c: UBIFS file-system description object * - * This function calculates and returns the number of eraseblocks which should - * be kept for index usage. + * This function calculates and returns the number of LEBs which should be kept + * for index usage. 
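+ * + * As a rough illustration (the numbers are made up): if the current plus + * budgeted index size is 1MiB, it is first tripled to 3MiB (see below), + * and with an effective index LEB size (@c->idx_leb_size) of 126KiB this + * rounds up to DIV_ROUND_UP(3MiB, 126KiB) = 25 LEBs, plus the extra LEB + * reserved for the in-the-gaps method.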
*/ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) { - int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; + int idx_lebs; long long idx_size; idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; - /* And make sure we have thrice the index size of space reserved */ - idx_size = idx_size + (idx_size << 1); - + idx_size += idx_size << 1; /* * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' * pair, nor similarly the two variables for the new index size, so we * have to do this costly 64-bit division on fast-path. */ - idx_size += eff_leb_size - 1; - idx_lebs = div_u64(idx_size, eff_leb_size); + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); /* * The index head is not available for the in-the-gaps method, so add an * extra LEB to compensate. @@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c) * do_budget_space - reserve flash space for index and data growth. * @c: UBIFS file-system description object * - * This function makes sure UBIFS has enough free eraseblocks for index growth - * and data. + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. * * When budgeting index space, UBIFS reserves thrice as many LEBs as the index * would take if it was consolidated and written to the flash. This guarantees * that the "in-the-gaps" commit method always succeeds and UBIFS will always * be able to commit dirty index. So this function basically adds amount of * budgeted index space to the size of the current index, multiplies this by 3, - * and makes sure this does not exceed the amount of free eraseblocks. + * and makes sure this does not exceed the amount of free LEBs. * * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might * be large, because UBIFS does not do any index consolidation as long as * there is free space. IOW, the index may take a lot of LEBs, but the LEBs * will contain a lot of dirt. - * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be - * consolidated to take up to @c->min_idx_lebs LEBs. + * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW, + * the index may be consolidated to take up to @c->min_idx_lebs LEBs. * * This function returns zero in case of success, and %-ENOSPC in case of * failure. @@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free) * This function calculates amount of free space to report to user-space. * * Because UBIFS may introduce substantial overhead (the index, node headers, - * alignment, wastage at the end of eraseblocks, etc), it cannot report real - * amount of free flash space it has (well, because not all dirty space is - * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, - * it would bread user expectations about what free space is. Users seem to - * accustomed to assume that if the file-system reports N bytes of free space, - * they would be able to fit a file of N bytes to the FS. This almost works for + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * break user expectations about what free space is.
Users seem to be accustomed + * to assuming that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index e975bd82f38b..ce2cd8343618 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) "bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) - printk("%c", dent->name[i]); + printk(KERN_CONT "%c", dent->name[i]); } - printk("\n"); + printk(KERN_CONT "\n"); break; } @@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) /* * Make sure the last key in our znode is less or - * equivalent than the the key in zbranch which goes + * equivalent than the key in the zbranch which goes * after our pointing zbranch. */ cmp = keys_cmp(c, max, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 0ff89fe71e51..6d34dc7e33e1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, struct ubifs_inode *ui = ubifs_inode(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; struct page *page; ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); @@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (!PageUptodate(page)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { /* * We change whole page so no need to load it. But we * have to set the @PG_checked flag to make the further * code know that the page is new. This might be not * true, but it is better to budget more than to read * the media. */ SetPageChecked(page); - else { + skipped_read = 1; + } else { err = do_readpage(page); if (err) { unlock_page(page); @@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(err)) { ubifs_assert(err == -ENOSPC); /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } + /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the * request. Write-back cannot be done now, while we have the @@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len) * whole index and correct all inode sizes, which is long and unacceptable. * * To prevent situations like this, UBIFS writes pages back only if they are - * within last synchronized inode size, i.e. the the size which has been + * within the last synchronized inode size, i.e. the size which has been * written to the flash media last time. Otherwise, UBIFS forces inode * write-back, thus making sure the on-flash inode contains current inode size, * and then keeps writing pages back. diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 717d79c97c5e..1d54383d1269 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * ubifs_find_free_space - find a data LEB with free space.
* @c: the UBIFS file-system description object * @min_space: minimum amount of required free space - * @free: contains amount of free space in the LEB on exit + * @offs: contains offset of where free space starts on exit * @squeeze: whether to try to find space in a non-empty LEB first * * This function looks for an LEB with at least @min_space bytes of free space. @@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * failed to find a LEB with @min_space bytes of free space and other negative * error codes in case of failure. */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze) { const struct ubifs_lprops *lprops; @@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, spin_unlock(&c->space_lock); } - *free = lprops->free; + *offs = c->leb_size - lprops->free; ubifs_release_lprops(c); - if (*free == c->leb_size) { + if (*offs == 0) { /* * Ensure that empty LEBs have been unmapped. They may not have * been, for example, because of an unclean unmount. Also @@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, return err; } - dbg_find("found LEB %d, free %d", lnum, *free); - ubifs_assert(*free >= min_space); + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); + ubifs_assert(*offs <= c->leb_size - min_space); return lnum; out: diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index a711d33b3d3e..f0f5f15d384e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -47,7 +47,7 @@ * have to waste large pieces of free space at the end of LEB B, because nodes * from LEB A would not fit. And the worst situation is when all nodes are of * maximum size. So dark watermark is the amount of free + dirty space in LEB - * which are guaranteed to be reclaimable. If LEB has less space, the GC migh + * which are guaranteed to be reclaimable. If LEB has less space, the GC might * be unable to reclaim it. So, LEBs with free + dirty greater than dark * watermark are "good" LEBs from GC's point of view. The other LEBs are not so * good, and GC takes extra care when moving them. @@ -57,14 +57,6 @@ #include "ubifs.h" /* - * GC tries to optimize the way it fit nodes to available space, and it sorts - * nodes a little. The below constants are watermarks which define "large", - * "medium", and "small" nodes. - */ -#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4) -#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ - -/* * GC may need to move more than one LEB to make progress. The below constants * define "soft" and "hard" limits on the number of LEBs the garbage collector * may move. @@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c) } /** - * joinup - bring data nodes for an inode together. - * @c: UBIFS file-system description object - * @sleb: describes scanned LEB - * @inum: inode number - * @blk: block number - * @data: list to which to add data nodes + * list_sort - sort a list. + * @priv: private data, passed to @cmp + * @head: the list to sort + * @cmp: the elements comparison function * - * This function looks at the first few nodes in the scanned LEB @sleb and adds - * them to @data if they are data nodes from @inum and have a larger block - * number than @blk. This function returns %0 on success and a negative error - * code on failure. + * This function has been implemented by Mark J Roberts <mjr@znex.org>. It + * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
The list is sorted + * in ascending order. + * + * The comparison function @cmp is supposed to return a negative value if @a is + * than @b, and a positive value if @a is greater than @b. If @a and @b are + * equivalent, then it does not matter what this function returns. */ -static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, - unsigned int blk, struct list_head *data) +static void list_sort(void *priv, struct list_head *head, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b)) { - int err, cnt = 6, lnum = sleb->lnum, offs; - struct ubifs_scan_node *snod, *tmp; - union ubifs_key *key; + struct list_head *p, *q, *e, *list, *tail, *oldhead; + int insize, nmerges, psize, qsize, i; + + if (list_empty(head)) + return; + + list = head->next; + list_del(head); + insize = 1; + for (;;) { + p = oldhead = list; + list = tail = NULL; + nmerges = 0; + + while (p) { + nmerges++; + q = p; + psize = 0; + for (i = 0; i < insize; i++) { + psize++; + q = q->next == oldhead ? NULL : q->next; + if (!q) + break; + } - list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - key = &snod->key; - if (key_inum(c, key) == inum && - key_type(c, key) == UBIFS_DATA_KEY && - key_block(c, key) > blk) { - offs = snod->offs; - err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); - if (err < 0) - return err; - list_del(&snod->list); - if (err) { - list_add_tail(&snod->list, data); - blk = key_block(c, key); - } else - kfree(snod); - cnt = 6; - } else if (--cnt == 0) + qsize = insize; + while (psize > 0 || (qsize > 0 && q)) { + if (!psize) { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } else if (!qsize || !q) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else if (cmp(priv, p, q) <= 0) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } + if (tail) + tail->next = e; + else + list = e; + e->prev = tail; + tail = e; + } + p = q; + } + + tail->next = list; + list->prev = tail; + + if (nmerges <= 1) break; + + insize *= 2; } - return 0; + + head->next = list; + head->prev = list->prev; + list->prev->next = head; + list->prev = head; } /** - * move_nodes - move nodes. + * data_nodes_cmp - compare 2 data nodes. + * @priv: UBIFS file-system description object + * @a: first data node + * @a: second data node + * + * This function compares data nodes @a and @b. Returns %1 if @a has greater + * inode or block number, and %-1 otherwise. + */ +int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + unsigned int blka = key_block(c, &sa->key); + unsigned int blkb = key_block(c, &sb->key); + + if (blka <= blkb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/* + * nondata_nodes_cmp - compare 2 non-data nodes. + * @priv: UBIFS file-system description object + * @a: first node + * @a: second node + * + * This function compares nodes @a and @b. It makes sure that inode nodes go + * first and sorted by length in descending order. 
Directory entry nodes go + * after inode nodes and are sorted in ascending hash valuer order. + */ +int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + int typea, typeb; + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + typea = key_type(c, &sa->key); + typeb = key_type(c, &sb->key); + ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); + + /* Inodes go before directory entries */ + if (typea == UBIFS_INO_KEY) { + if (typeb == UBIFS_INO_KEY) + return sb->len - sa->len; + return -1; + } + if (typeb == UBIFS_INO_KEY) + return 1; + + ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + uint32_t hasha = key_hash(c, &sa->key); + uint32_t hashb = key_hash(c, &sb->key); + + if (hasha <= hashb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/** + * sort_nodes - sort nodes for GC. * @c: UBIFS file-system description object - * @sleb: describes nodes to move + * @sleb: describes nodes to sort and contains the result on exit + * @nondata: contains non-data nodes on exit + * @min: minimum node size is returned here * - * This function moves valid nodes from data LEB described by @sleb to the GC - * journal head. The obsolete nodes are dropped. + * This function sorts the list of inodes to garbage collect. First of all, it + * kills obsolete nodes and separates data and non-data nodes to the + * @sleb->nodes and @nondata lists correspondingly. + * + * Data nodes are then sorted in block number order - this is important for + * bulk-read; data nodes with lower inode number go before data nodes with + * higher inode number, and data nodes with lower block number go before data + * nodes with higher block number; * - * When moving nodes we have to deal with classical bin-packing problem: the - * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", - * where the nodes in the @sleb->nodes list are the elements which should be - * fit optimally to the bins. This function uses the "first fit decreasing" - * strategy, although it does not really sort the nodes but just split them on - * 3 classes - large, medium, and small, so they are roughly sorted. + * Non-data nodes are sorted as follows. + * o First go inode nodes - they are sorted in descending length order. + * o Then go directory entry nodes - they are sorted in hash order, which + * should supposedly optimize 'readdir()'. Direntry nodes with lower parent + * inode number go before direntry nodes with higher parent inode number, + * and direntry nodes with lower name hash values go before direntry nodes + * with higher name hash values. * - * This function returns zero in case of success, %-EAGAIN if commit is - * required, and other negative error codes in case of other failures. + * This function returns zero in case of success and a negative error code in + * case of failure. 
*/ -static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct list_head *nondata, int *min) { struct ubifs_scan_node *snod, *tmp; - struct list_head data, large, medium, small; - struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; - int avail, err, min = INT_MAX; - unsigned int blk = 0; - ino_t inum = 0; - INIT_LIST_HEAD(&data); - INIT_LIST_HEAD(&large); - INIT_LIST_HEAD(&medium); - INIT_LIST_HEAD(&small); + *min = INT_MAX; - while (!list_empty(&sleb->nodes)) { - struct list_head *lst = sleb->nodes.next; - - snod = list_entry(lst, struct ubifs_scan_node, list); + /* Separate data nodes and non-data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + int err; ubifs_assert(snod->type != UBIFS_IDX_NODE); ubifs_assert(snod->type != UBIFS_REF_NODE); @@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, snod->offs, 0); if (err < 0) - goto out; + return err; - list_del(lst); if (!err) { /* The node is obsolete, remove it from the list */ + list_del(&snod->list); kfree(snod); continue; } - /* - * Sort the list of nodes so that data nodes go first, large - * nodes go second, and small nodes go last. - */ - if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { - if (inum != key_inum(c, &snod->key)) { - if (inum) { - /* - * Try to move data nodes from the same - * inode together. - */ - err = joinup(c, sleb, inum, blk, &data); - if (err) - goto out; - } - inum = key_inum(c, &snod->key); - blk = key_block(c, &snod->key); - } - list_add_tail(lst, &data); - } else if (snod->len > MEDIUM_NODE_WM) - list_add_tail(lst, &large); - else if (snod->len > SMALL_NODE_WM) - list_add_tail(lst, &medium); - else - list_add_tail(lst, &small); - - /* And find the smallest node */ - if (snod->len < min) - min = snod->len; + if (snod->len < *min) + *min = snod->len; + + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) + list_move_tail(&snod->list, nondata); } - /* - * Join the tree lists so that we'd have one roughly sorted list - * ('large' will be the head of the joined list). - */ - list_splice(&data, &large); - list_splice(&medium, large.prev); - list_splice(&small, large.prev); + /* Sort data and non-data nodes */ + list_sort(c, &sleb->nodes, &data_nodes_cmp); + list_sort(c, nondata, &nondata_nodes_cmp); + return 0; +} + +/** + * move_node - move a node. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * @snod: the node to move + * @wbuf: write-buffer to move node to + * + * This function moves node @snod to @wbuf, changes TNC correspondingly, and + * destroys @snod. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) +{ + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; + + cond_resched(); + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); + if (err) + return err; + + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + list_del(&snod->list); + kfree(snod); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head.
This function returns zero in case of success, %-EAGAIN if + * commit is required, and other negative error codes in case of other + * failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + int err, min; + LIST_HEAD(nondata); + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; if (wbuf->lnum == -1) { /* @@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) */ err = switch_gc_head(c); if (err) - goto out; + return err; } + err = sort_nodes(c, sleb, &nondata, &min); + if (err) + goto out; + /* Write nodes to their new location. Use the first-fit strategy */ while (1) { - avail = c->leb_size - wbuf->offs - wbuf->used; - list_for_each_entry_safe(snod, tmp, &large, list) { - int new_lnum, new_offs; + int avail; + struct ubifs_scan_node *snod, *tmp; + + /* Move data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (snod->len > avail) + /* + * Do not skip data nodes in order to optimize + * bulk-read. + */ + break; + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + /* Move non-data nodes */ + list_for_each_entry_safe(snod, tmp, &nondata, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; if (avail < min) break; - if (snod->len > avail) - /* This node does not fit */ + if (snod->len > avail) { + /* + * Keep going only if this is an inode with + * some data. Otherwise stop and switch the GC + * head. IOW, we assume that data-less inode + * nodes and direntry nodes are roughly of the + * same size. + */ + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || + snod->len == UBIFS_INO_NODE_SZ) + break; continue; + } - cond_resched(); - - new_lnum = wbuf->lnum; - new_offs = wbuf->offs + wbuf->used; - err = ubifs_wbuf_write_nolock(wbuf, snod->node, - snod->len); + err = move_node(c, sleb, snod, wbuf); if (err) goto out; - err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, - snod->offs, new_lnum, new_offs, - snod->len); - if (err) - goto out; - - avail = c->leb_size - wbuf->offs - wbuf->used; - list_del(&snod->list); - kfree(snod); } - if (list_empty(&large)) + if (list_empty(&sleb->nodes) && list_empty(&nondata)) break; /* @@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) return 0; out: - list_for_each_entry_safe(snod, tmp, &large, list) { - list_del(&snod->list); - kfree(snod); - } + list_splice_tail(&nondata, &sleb->nodes); return err; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index a11ca0958a23..64b5f3a309f5 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) */ static int reserve_space(struct ubifs_info *c, int jhead, int len) { - int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; /* @@ -139,10 +139,9 @@ again: * Write buffer wasn't seek'ed or there is no enough space - look for an * LEB with some empty space. 
*/ - lnum = ubifs_find_free_space(c, len, &free, squeeze); + lnum = ubifs_find_free_space(c, len, &offs, squeeze); if (lnum >= 0) { /* Found an LEB, add it to the journal head */ - offs = c->leb_size - free; err = ubifs_add_bud_to_log(c, jhead, lnum, offs); if (err) goto out_return; @@ -1366,7 +1365,7 @@ out_ro: * @host: host inode * * This function writes the updated version of an extended attribute inode and - * the host inode tho the journal (to the base head). The host inode is written + * the host inode to the journal (to the base head). The host inode is written * after the extended attribute inode in order to guarantee that the extended * attribute will be flushed when the inode is synchronized by 'fsync()' and * consequently, the write-buffer is synchronized. This function returns zero diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index efb3430a2581..5fa27ea031ba 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) * @c: UBIFS file-system description object * @key: the key to get hash from */ -static inline int key_hash(const struct ubifs_info *c, - const union ubifs_key *key) +static inline uint32_t key_hash(const struct ubifs_info *c, + const union ubifs_key *key) { return key->u32[1] & UBIFS_S_KEY_HASH_MASK; } @@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c, * @c: UBIFS file-system description object * @k: the key to get hash from */ -static inline int key_hash_flash(const struct ubifs_info *c, const void *k) +static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) { const union ubifs_key *key = k; diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 3e0aa7367556..56e33772a1ee 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) } /* - * Make sure the the amount of space in buds will not exceed + * Make sure the amount of space in buds will not exceed the * 'c->max_bud_bytes' limit, because we want to guarantee mount time * limits. * @@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c) bud->jhead, c->leb_size - bud->start, c->cmt_bud_bytes); rb_erase(p1, &c->buds); - list_del(&bud->list); /* * If the commit does not finish, the recovery will need * to replay the journal, in which case the old buds @@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c) * commit i.e. do not allow them to be garbage * collected. 
*/ - list_add(&bud->list, &c->old_buds); + list_move(&bud->list, &c->old_buds); } } spin_unlock(&c->buds_lock); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 3216a1f277f8..8cbfb8248025 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c) while (offs + len > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->lsave_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->ltab_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c) alen, UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 4, alen - wlen); } - dbg_chk_lpt_sz(c, 2, 0); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; - from = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 2, alen - wlen); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 2, alen - wlen); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) /** * dbg_chk_lpt_sz - check LPT does not write more than LPT size. * @c: the UBIFS file-system description object - * @action: action + * @action: what to do * @len: length written * * This function returns %0 on success and a negative error code on failure. + * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. 
+ * o %4 - wasted @len bytes; */ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) { @@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) lnum, offs); err = ubifs_unpack_nnode(c, buf, &nnode); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { - printk("%d:%d", nnode.nbranch[i].lnum, + printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); if (i != UBIFS_LPT_FANOUT - 1) - printk(", "); + printk(KERN_CONT ", "); } - printk("\n"); + printk(KERN_CONT "\n"); break; } case UBIFS_LPT_LTAB: diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 90acac603e63..10662975d2ef 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, * @lnum: LEB number of the LEB from which @buf was read * @offs: offset from which @buf was read * - * This function scans @buf for more nodes and returns %0 is a node is found and - * %1 if no more nodes are found. + * This function ensures that the corrupted node at @offs is the last thing + * written to the LEB. It returns %1 if no more data is found and %0 if more + * data is found. */ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int lnum, int offs) { - int skip, next_offs = 0; + struct ubifs_ch *ch = buf; + int skip, dlen = le32_to_cpu(ch->len); - if (len > UBIFS_DATA_NODE_SZ) { - struct ubifs_ch *ch = buf; - int dlen = le32_to_cpu(ch->len); - - if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && - dlen <= UBIFS_MAX_DATA_NODE_SZ) - /* The corrupt node looks like a data node */ - next_offs = ALIGN(offs + dlen, 8); - } - - if (c->min_io_size == 1) - skip = 8; - else - skip = ALIGN(offs + 1, c->min_io_size) - offs; - - offs += skip; - buf += skip; - len -= skip; - while (len > 8) { - struct ubifs_ch *ch = buf; - uint32_t magic = le32_to_cpu(ch->magic); - int ret; - - if (magic == UBIFS_NODE_MAGIC) { - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); - if (ret == SCANNED_A_NODE || ret > 0) { - /* - * There is a small chance this is just data in - * a data node, so check that possibility. e.g. - * this is part of a file that itself contains - * a UBIFS image. - */ - if (next_offs && offs + le32_to_cpu(ch->len) <= - next_offs) - continue; - dbg_rcvry("unexpected node at %d:%d", lnum, - offs); - return 0; - } - } - offs += 8; - buf += 8; - len -= 8; + /* Check for empty space after the corrupt node's common header */ + skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; + if (is_empty(buf + skip, len - skip)) + return 1; + /* + * The area after the common header size is not empty, so the common + * header must be intact. Check it. + */ + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); + return 0; } - return 1; + /* Now that we know the corrupt node's length, we can skip over it */ + skip = ALIGN(offs + dlen, c->min_io_size) - offs; + /* After which there should be empty space */ + if (is_empty(buf + skip, len - skip)) + return 1; + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); + return 0; } /** diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index ce42a7b0ca5a..11cc80125a49 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) dirty -= c->leb_size - lp->free; /* * If the replay order was perfect the dirty space would now be - * zero.
The order is not perfect because the the journal heads + * zero. The order is not perfect because the journal heads * race with each other. This is not a problem but is does mean * that the dirty space may temporarily exceed c->leb_size * during the replay. diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index e070c643d1bb..57085e43320f 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (tmp64 > DEFAULT_MAX_RP_SIZE) tmp64 = DEFAULT_MAX_RP_SIZE; sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); kfree(sup); @@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c) if (IS_ERR(sup)) return PTR_ERR(sup); + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + /* * The software supports all previous versions but not future versions, * due to the unavailability of time-travelling equipment. */ - c->fmt_version = le32_to_cpu(sup->fmt_version); if (c->fmt_version > UBIFS_FORMAT_VERSION) { - ubifs_err("on-flash format version is %d, but software only " - "supports up to version %d", c->fmt_version, - UBIFS_FORMAT_VERSION); - err = -EINVAL; - goto out; + struct super_block *sb = c->vfs_sb; + int mounting_ro = sb->s_flags & MS_RDONLY; + + ubifs_assert(!c->ro_media || mounting_ro); + if (!mounting_ro || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err("on-flash format version is w%d/r%d, but " + "software only supports up to version " + "w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg("only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. + */ + c->rw_incompat = 1; } if (c->fmt_version < 3) { @@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c) c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; c->main_first = c->leb_cnt - c->main_lebs; - c->report_rp_size = ubifs_reported_space(c, c->rp_size); err = validate_sb(c, sup); out: diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index e7bab52a1410..02feb59cefca 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention) * Move this one to the end of the list to provide some * fairness. 
*/ - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); mutex_unlock(&c->umount_mutex); if (freed >= nr) break; @@ -263,8 +262,7 @@ static int kick_a_thread(void) } if (i == 1) { - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); ubifs_request_bg_commit(c); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index c5c98355459a..e9f7a754c4f7 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { - seq_printf(s, ",compr="); - seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); } return 0; @@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c) if (err) return err; + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; return 0; } @@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c) long long tmp64; c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* * Calculate total amount of FS blocks. This number is not used @@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c) goto out_cbuf; /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c) else { c->need_recovery = 0; ubifs_msg("recovery completed"); - /* GC LEB has to be empty and taken at this point */ - ubifs_assert(c->lst.taken_empty_lebs == 1); + /* + * GC LEB has to be empty and taken at this point. But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. 
+ */ + ubifs_assert(c->lst.taken_empty_lebs > 0); } } else - ubifs_assert(c->lst.taken_empty_lebs == 1); + ubifs_assert(c->lst.taken_empty_lebs > 0); err = dbg_check_filesystem(c); if (err) @@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c) x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("media format: %d (latest is %d)", - c->fmt_version, UBIFS_FORMAT_VERSION); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); ubifs_msg("reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); @@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c) { int err, lnum; + if (c->rw_incompat) { + ubifs_err("the file-system is not R/W-compatible"); + ubifs_msg("on-flash format version is w%d/r%d, but software " + "only supports up to version w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } + mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; @@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) ubifs_create_buds_lists(c); /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) c->bu.buf = NULL; } - ubifs_assert(c->lst.taken_empty_lebs == 1); + ubifs_assert(c->lst.taken_empty_lebs > 0); return 0; } @@ -2038,8 +2055,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, return 0; out_deact: - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); return err; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index fa28a84c6a1b..f249f7b0d656 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, * splitting in the middle of the colliding sequence. Also, when * removing the leftmost key, we would have to correct the key of the * parent node, which would introduce additional complications. Namely, - * if we changed the the leftmost key of the parent znode, the garbage + * if we changed the leftmost key of the parent znode, the garbage * collector would be unable to find it (GC is doing this when GC'ing * indexing LEBs). Although we already have an additional RB-tree where * we save such changed znodes (see 'ins_clr_old_idx_znode()') until diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index b25fc36cf72f..3eee07e0c495 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -36,9 +36,31 @@ /* UBIFS node magic number (must not have the padding byte first or last) */ #define UBIFS_NODE_MAGIC 0x06101831 -/* UBIFS on-flash format version */ +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format changes. If this happens, UBIFS will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature.
+ * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ #define UBIFS_FORMAT_VERSION 4 +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. + */ +#define UBIFS_RO_COMPAT_VERSION 0 + /* Minimum logical eraseblock size in bytes */ #define UBIFS_MIN_LEB_SZ (15*1024) @@ -53,7 +75,7 @@ /* * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes - * shorter than uncompressed data length, UBIFS preferes to leave this data + * shorter than uncompressed data length, UBIFS prefers to leave this data * node uncompress, because it'll be read faster. */ #define UBIFS_MIN_COMPRESS_DIFF 64 @@ -586,6 +608,7 @@ struct ubifs_pad_node { * @padding2: reserved for future, zeroes * @time_gran: time granularity in nanoseconds * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version */ struct ubifs_sb_node { struct ubifs_ch ch; @@ -612,7 +635,8 @@ struct ubifs_sb_node { __le64 rp_size; __le32 time_gran; __u8 uuid[16]; - __u8 padding2[3972]; + __le32 ro_compat_version; + __u8 padding2[3968]; } __attribute__ ((packed)); /** diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 039a68bee29a..0a8341e14088 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -934,6 +934,7 @@ struct ubifs_debug_info; * by @commit_sem * @cnt_lock: protects @highest_inum and @max_sqnum counters * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version * @uuid: UUID from super block * * @lhead_lnum: log head logical eraseblock number @@ -966,6 +967,7 @@ struct ubifs_debug_info; * recovery) * @bulk_read: enable bulk-reads * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -1015,6 +1017,8 @@ struct ubifs_debug_info; * @min_io_shift: number of bits in @min_io_size minus one * @leb_size: logical eraseblock size in bytes * @half_leb_size: half LEB size + * @idx_leb_size: how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) * @leb_cnt: count of logical eraseblocks * @max_leb_cnt: maximum count of logical eraseblocks * @old_leb_cnt: count of logical eraseblocks before re-size @@ -1132,8 +1136,8 @@ struct ubifs_debug_info; * previous commit start * @uncat_list: list of un-categorized LEBs * @empty_list: list of empty LEBs - * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) - * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) * @freeable_cnt: number of freeable LEBs in @freeable_list * * @ltab_lnum: LEB number of LPT's own lprops table @@ -1177,6 +1181,7 @@ struct ubifs_info { unsigned long long cmt_no; spinlock_t cnt_lock; int fmt_version; + int 
ro_compat_version; unsigned char uuid[16]; int lhead_lnum; @@ -1205,6 +1210,7 @@ struct ubifs_info { unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; + unsigned int rw_incompat:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1253,6 +1259,7 @@ struct ubifs_info { int min_io_shift; int leb_size; int half_leb_size; + int idx_leb_size; int leb_cnt; int max_leb_cnt; int old_leb_cnt; @@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze); int ubifs_find_free_leb_for_idx(struct ubifs_info *c); int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index dbbbc4668769..6321b797061b 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -666,6 +666,6 @@ not_empty: const struct file_operations ufs_dir_operations = { .read = generic_read_dir, .readdir = ufs_readdir, - .fsync = file_fsync, + .fsync = ufs_sync_file, .llseek = generic_file_llseek, }; diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 625ef17c6f83..2bd3a1615714 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -30,7 +30,7 @@ #include "ufs.h" -static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync) +int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; int err; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 69b3427d7885..d0c4acd4f1f3 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -98,8 +98,8 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, /* file.c */ extern const struct inode_operations ufs_file_inode_operations; extern const struct file_operations ufs_file_operations; - extern const struct address_space_operations ufs_aops; +extern int ufs_sync_file(struct file *, struct dentry *, int); /* ialloc.c */ extern void ufs_free_inode (struct inode *inode); diff --git a/fs/xattr.c b/fs/xattr.c index 197c4fcac032..d51b8f9db921 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if (size) { if (size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kmalloc(size, GFP_KERNEL); - if (!kvalue) - return -ENOMEM; - if (copy_from_user(kvalue, value, size)) { - kfree(kvalue); - return -EFAULT; - } + kvalue = memdup_user(value, size); + if (IS_ERR(kvalue)) + return PTR_ERR(kvalue); } error = vfs_setxattr(d, kname, kvalue, size, flags); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c13f67300fe7..7ec89fc05b2b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -153,23 +153,6 @@ xfs_find_bdev_for_inode( } /* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. - */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - queue_work(xfsdatad_workqueue, &ioend->io_work); - if (wait) - flush_workqueue(xfsdatad_workqueue); - } -} - -/* * We're now finished for good with this ioend structure. 
* Update the page state via the associated buffer_heads, * release holds on the inode and bio, and finally free @@ -310,6 +293,27 @@ xfs_end_bio_read( } /* + * Schedule IO completion handling on a xfsdatad if this was + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. + */ +STATIC void +xfs_finish_ioend( + xfs_ioend_t *ioend, + int wait) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct workqueue_struct *wq = xfsdatad_workqueue; + if (ioend->io_work.func == xfs_end_bio_unwritten) + wq = xfsconvertd_workqueue; + + queue_work(wq, &ioend->io_work); + if (wait) + flush_workqueue(wq); + } +} + +/* * Allocate and initialise an IO completion structure. * We need to track unwritten extent write completion here initially. * We'll need to extend this for updating the ondisk inode size later diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 1dd528849755..221b3e66ceef 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -19,6 +19,7 @@ #define __XFS_AOPS_H__ extern struct workqueue_struct *xfsdatad_workqueue; +extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; /* diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index aa1016bb9134..e28800a9f2b5 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -51,6 +51,7 @@ static struct shrinker xfs_buf_shake = { static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; +struct workqueue_struct *xfsconvertd_workqueue; #ifdef XFS_BUF_TRACE void @@ -1775,6 +1776,7 @@ xfs_flush_buftarg( xfs_buf_t *bp, *n; int pincount = 0; + xfs_buf_runall_queues(xfsconvertd_workqueue); xfs_buf_runall_queues(xfsdatad_workqueue); xfs_buf_runall_queues(xfslogd_workqueue); @@ -1831,9 +1833,15 @@ xfs_buf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; + xfsconvertd_workqueue = create_workqueue("xfsconvertd"); + if (!xfsconvertd_workqueue) + goto out_destroy_xfsdatad_workqueue; + register_shrinker(&xfs_buf_shake); return 0; + out_destroy_xfsdatad_workqueue: + destroy_workqueue(xfsdatad_workqueue); out_destroy_xfslogd_workqueue: destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: @@ -1849,6 +1857,7 @@ void xfs_buf_terminate(void) { unregister_shrinker(&xfs_buf_shake); + destroy_workqueue(xfsconvertd_workqueue); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 5aeb77776961..08be36d7326c 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -74,14 +74,14 @@ xfs_flush_pages( if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_fdatawrite(mapping); - if (flags & XFS_B_ASYNC) - return -ret; - ret2 = filemap_fdatawait(mapping); - if (!ret) - ret = ret2; + ret = -filemap_fdatawrite(mapping); } - return -ret; + if (flags & XFS_B_ASYNC) + return ret; + ret2 = xfs_wait_on_pages(ip, first, last); + if (!ret) + ret = ret2; + return ret; } int diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index d0b499418a7d..34eaab608e6e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -489,17 +489,12 @@ xfs_attrmulti_attr_set( if (len > XATTR_SIZE_MAX) return EINVAL; - kbuf = kmalloc(len, GFP_KERNEL); - if (!kbuf) - return ENOMEM; - - if (copy_from_user(kbuf, ubuf, len)) - goto out_kfree; + kbuf = 
memdup_user(ubuf, len); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); - out_kfree: - kfree(kbuf); return error; } @@ -540,20 +535,16 @@ xfs_attrmulti_by_handle( if (!size || size > 16 * PAGE_SIZE) goto out_dput; - error = ENOMEM; - ops = kmalloc(size, GFP_KERNEL); - if (!ops) + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); goto out_dput; - - error = EFAULT; - if (copy_from_user(ops, am_hreq.ops, size)) - goto out_kfree_ops; + } attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { ops[i].am_error = strncpy_from_user(attr_name, diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index c70c4e3db790..0882d166239a 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle( if (!size || size > 16 * PAGE_SIZE) goto out_dput; - error = ENOMEM; - ops = kmalloc(size, GFP_KERNEL); - if (!ops) + ops = memdup_user(compat_ptr(am_hreq.ops), size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); goto out_dput; - - error = EFAULT; - if (copy_from_user(ops, compat_ptr(am_hreq.ops), size)) - goto out_kfree_ops; + } attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { ops[i].am_error = strncpy_from_user(attr_name, diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7e90daa0d1d1..9142192ccbe6 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -751,10 +751,26 @@ start: goto relock; } } else { + int enospc = 0; + ssize_t ret2 = 0; + +write_retry: xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, *offset, ioflags); - ret = generic_file_buffered_write(iocb, iovp, segs, + ret2 = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we + * aren't holding any page locks and retry *once* + */ + if (ret2 == -ENOSPC && !enospc) { + error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE); + if (error) + goto out_unlock_internal; + enospc = 1; + goto write_retry; + } + ret = ret2; } current->backing_dev_info = NULL; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a608e72fa405..f7ba76633c29 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -62,12 +62,6 @@ xfs_sync_inodes_ag( uint32_t first_index = 0; int error = 0; int last_error = 0; - int fflag = XFS_B_ASYNC; - - if (flags & SYNC_DELWRI) - fflag = XFS_B_DELWRI; - if (flags & SYNC_WAIT) - fflag = 0; /* synchronous overrides all */ do { struct inode *inode; @@ -128,11 +122,23 @@ xfs_sync_inodes_ag( * If we have to flush data or wait for I/O completion * we need to hold the iolock. */ - if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) { - xfs_ilock(ip, XFS_IOLOCK_SHARED); - lock_flags |= XFS_IOLOCK_SHARED; - error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE); - if (flags & SYNC_IOWAIT) + if (flags & SYNC_DELWRI) { + if (VN_DIRTY(inode)) { + if (flags & SYNC_TRYLOCK) { + if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + lock_flags |= XFS_IOLOCK_SHARED; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + lock_flags |= XFS_IOLOCK_SHARED; + } + if (lock_flags & XFS_IOLOCK_SHARED) { + error = xfs_flush_pages(ip, 0, -1, + (flags & SYNC_WAIT) ? 
0 + : XFS_B_ASYNC, + FI_NONE); + } + } + if (VN_CACHED(inode) && (flags & SYNC_IOWAIT)) xfs_ioend_wait(ip); } xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -398,15 +404,17 @@ STATIC void xfs_syncd_queue_work( struct xfs_mount *mp, void *data, - void (*syncer)(struct xfs_mount *, void *)) + void (*syncer)(struct xfs_mount *, void *), + struct completion *completion) { - struct bhv_vfs_sync_work *work; + struct xfs_sync_work *work; - work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP); + work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP); INIT_LIST_HEAD(&work->w_list); work->w_syncer = syncer; work->w_data = data; work->w_mount = mp; + work->w_completion = completion; spin_lock(&mp->m_sync_lock); list_add_tail(&work->w_list, &mp->m_sync_list); spin_unlock(&mp->m_sync_lock); @@ -420,49 +428,26 @@ xfs_syncd_queue_work( * heads, looking about for more room... */ STATIC void -xfs_flush_inode_work( - struct xfs_mount *mp, - void *arg) -{ - struct inode *inode = arg; - filemap_flush(inode->i_mapping); - iput(inode); -} - -void -xfs_flush_inode( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); - delay(msecs_to_jiffies(500)); -} - -/* - * This is the "bigger hammer" version of xfs_flush_inode_work... - * (IOW, "If at first you don't succeed, use a Bigger Hammer"). - */ -STATIC void -xfs_flush_device_work( +xfs_flush_inodes_work( struct xfs_mount *mp, void *arg) { struct inode *inode = arg; - sync_blockdev(mp->m_super->s_bdev); + xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK); + xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT); iput(inode); } void -xfs_flush_device( +xfs_flush_inodes( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); + DECLARE_COMPLETION_ONSTACK(completion); igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); - delay(msecs_to_jiffies(500)); + xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); + wait_for_completion(&completion); xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); } @@ -497,7 +482,7 @@ xfssyncd( { struct xfs_mount *mp = arg; long timeleft; - bhv_vfs_sync_work_t *work, *n; + xfs_sync_work_t *work, *n; LIST_HEAD (tmp); set_freezable(); @@ -532,6 +517,8 @@ xfssyncd( list_del(&work->w_list); if (work == &mp->m_sync_work) continue; + if (work->w_completion) + complete(work->w_completion); kmem_free(work); } } @@ -545,6 +532,7 @@ xfs_syncd_init( { mp->m_sync_work.w_syncer = xfs_sync_worker; mp->m_sync_work.w_mount = mp; + mp->m_sync_work.w_completion = NULL; mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); if (IS_ERR(mp->m_sync_task)) return -PTR_ERR(mp->m_sync_task); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 04f058c848ae..308d5bf6dfbd 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -21,18 +21,20 @@ struct xfs_mount; struct xfs_perag; -typedef struct bhv_vfs_sync_work { +typedef struct xfs_sync_work { struct list_head w_list; struct xfs_mount *w_mount; void *w_data; /* syncer routine argument */ void (*w_syncer)(struct xfs_mount *, void *); -} bhv_vfs_sync_work_t; + struct completion *w_completion; +} xfs_sync_work_t; #define SYNC_ATTR 0x0001 /* sync attributes */ #define SYNC_DELWRI 0x0002 /* look at delayed writes */ #define SYNC_WAIT 0x0004 /* wait for i/o to complete */ #define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */ #define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */ +#define 
SYNC_TRYLOCK 0x0020 /* only try to lock inodes */ int xfs_syncd_init(struct xfs_mount *mp); void xfs_syncd_stop(struct xfs_mount *mp); @@ -43,8 +45,7 @@ int xfs_sync_fsdata(struct xfs_mount *mp, int flags); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); -void xfs_flush_inode(struct xfs_inode *ip); -void xfs_flush_device(struct xfs_inode *ip); +void xfs_flush_inodes(struct xfs_inode *ip); int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3a6ed426327a..ca7c6005a487 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5880,7 +5880,7 @@ xfs_getbmap( void *arg) /* formatter arg */ { __int64_t bmvend; /* last block requested */ - int error; /* return value */ + int error = 0; /* return value */ __int64_t fixlen; /* length for -1 case */ int i; /* extent number */ int lock; /* lock state */ @@ -5890,39 +5890,18 @@ xfs_getbmap( int nexleft; /* # of user extents left */ int subnex; /* # of bmapi's can do */ int nmap; /* number of map entries */ - struct getbmapx out; /* output structure */ + struct getbmapx *out; /* output structure */ int whichfork; /* data or attr fork */ int prealloced; /* this is a file with * preallocated data space */ int iflags; /* interface flags */ int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; mp = ip->i_mount; iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not - * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ - * bit is set for the file, generate a read event in order - * that the DMAPI application may do its thing before we return - * the extents. Usually this means restoring user file data to - * regions of the file that look like holes. - * - * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify - * BMV_IF_NO_DMAPI_READ so that read events are generated. - * If this were not true, callers of ioctl( XFS_IOC_GETBMAP ) - * could misinterpret holes in a DMAPI file as true holes, - * when in fact they may represent offline user data. - */ - if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_READ) && - whichfork == XFS_DATA_FORK) { - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); - if (error) - return XFS_ERROR(error); - } - if (whichfork == XFS_ATTR_FORK) { if (XFS_IFORK_Q(ip)) { if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && @@ -5936,11 +5915,37 @@ xfs_getbmap( ip->i_mount); return XFS_ERROR(EFSCORRUPTED); } - } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - if (whichfork == XFS_DATA_FORK) { + + prealloced = 0; + fixlen = 1LL << 32; + } else { + /* + * If the BMV_IF_NO_DMAPI_READ interface bit specified, do + * not generate a DMAPI read event. Otherwise, if the + * DM_EVENT_READ bit is set for the file, generate a read + * event in order that the DMAPI application may do its thing + * before we return the extents. Usually this means restoring + * user file data to regions of the file that look like holes. + * + * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify + * BMV_IF_NO_DMAPI_READ so that read events are generated. 
+ * If this were not true, callers of ioctl(XFS_IOC_GETBMAP) + * could misinterpret holes in a DMAPI file as true holes, + * when in fact they may represent offline user data. + */ + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && + !(iflags & BMV_IF_NO_DMAPI_READ)) { + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, + 0, 0, 0, NULL); + if (error) + return XFS_ERROR(error); + } + + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; @@ -5949,42 +5954,41 @@ xfs_getbmap( prealloced = 0; fixlen = ip->i_size; } - } else { - prealloced = 0; - fixlen = 1LL << 32; } if (bmv->bmv_length == -1) { fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); - bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset), - (__int64_t)0); - } else if (bmv->bmv_length < 0) - return XFS_ERROR(EINVAL); - if (bmv->bmv_length == 0) { + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { bmv->bmv_entries = 0; return 0; + } else if (bmv->bmv_length < 0) { + return XFS_ERROR(EINVAL); } + nex = bmv->bmv_count - 1; if (nex <= 0) return XFS_ERROR(EINVAL); bmvend = bmv->bmv_offset + bmv->bmv_length; - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (((iflags & BMV_IF_DELALLOC) == 0) && - (whichfork == XFS_DATA_FORK) && - (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { - /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ - error = xfs_flush_pages(ip, (xfs_off_t)0, - -1, 0, FI_REMAPF); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return error; + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return XFS_ERROR(ENOMEM); + out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); + if (!out) + return XFS_ERROR(ENOMEM); + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { + if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + if (error) + goto out_unlock_iolock; } - } - ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) || - ip->i_delayed_blks == 0); + ASSERT(ip->i_delayed_blks == 0); + } lock = xfs_ilock_map_shared(ip); @@ -5995,23 +5999,25 @@ xfs_getbmap( if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - bmapi_flags = xfs_bmapi_aflag(whichfork) | - ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE); + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; /* * Allocate enough space to handle "subnex" maps at a time. 
*/ + error = ENOMEM; subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP); + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); + if (!map) + goto out_unlock_ilock; bmv->bmv_entries = 0; - if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) { - if (((iflags & BMV_IF_DELALLOC) == 0) || - whichfork == XFS_ATTR_FORK) { - error = 0; - goto unlock_and_return; - } + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; } nexleft = nex; @@ -6023,53 +6029,61 @@ xfs_getbmap( bmapi_flags, NULL, 0, map, &nmap, NULL, NULL); if (error) - goto unlock_and_return; + goto out_free_map; ASSERT(nmap <= subnex); for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { - out.bmv_oflags = 0; + out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) - out.bmv_oflags |= BMV_OF_PREALLOC; + out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; else if (map[i].br_startblock == DELAYSTARTBLOCK) - out.bmv_oflags |= BMV_OF_DELALLOC; - out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); - out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - out.bmv_unused1 = out.bmv_unused2 = 0; + out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; + out[cur_ext].bmv_offset = + XFS_FSB_TO_BB(mp, map[i].br_startoff); + out[cur_ext].bmv_length = + XFS_FSB_TO_BB(mp, map[i].br_blockcount); + out[cur_ext].bmv_unused1 = 0; + out[cur_ext].bmv_unused2 = 0; ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || (map[i].br_startblock != DELAYSTARTBLOCK)); if (map[i].br_startblock == HOLESTARTBLOCK && whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ - out.bmv_oflags |= BMV_OF_LAST; - goto unlock_and_return; - } else { - int full = 0; /* user array is full */ - - if (!xfs_getbmapx_fix_eof_hole(ip, &out, - prealloced, bmvend, - map[i].br_startblock)) { - goto unlock_and_return; - } - - /* format results & advance arg */ - error = formatter(&arg, &out, &full); - if (error || full) - goto unlock_and_return; - nexleft--; - bmv->bmv_offset = - out.bmv_offset + out.bmv_length; - bmv->bmv_length = MAX((__int64_t)0, - (__int64_t)(bmvend - bmv->bmv_offset)); - bmv->bmv_entries++; + out[cur_ext].bmv_oflags |= BMV_OF_LAST; + goto out_free_map; } + + if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], + prealloced, bmvend, + map[i].br_startblock)) + goto out_free_map; + + nexleft--; + bmv->bmv_offset = + out[cur_ext].bmv_offset + + out[cur_ext].bmv_length; + bmv->bmv_length = + max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + bmv->bmv_entries++; + cur_ext++; } } while (nmap && nexleft && bmv->bmv_length); -unlock_and_return: + out_free_map: + kmem_free(map); + out_unlock_ilock: xfs_iunlock_map_shared(ip, lock); + out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); - kmem_free(map); + for (i = 0; i < cur_ext; i++) { + int full = 0; /* user array is full */ + + /* format results & advance arg */ + error = formatter(&arg, &out[i], &full); + if (error || full) + break; + } return error; } diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 478e587087fe..89b81eedce6a 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -69,15 +69,6 @@ xfs_inode_alloc( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - /* - * initialise the VFS inode here to get failures - * out of the way early. 
- */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); - return NULL; - } - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -113,6 +104,20 @@ xfs_inode_alloc( #ifdef XFS_DIR2_TRACE ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif + /* + * Now initialise the VFS inode. We do this after the xfs_inode + * initialisation as internal failures will result in ->destroy_inode + * being called and that will pass down through the reclaim path and + * free the XFS inode. This path requires the XFS inode to already be + * initialised. Hence if this call fails, the xfs_inode has already + * been freed and we should not reference it at all in the error + * handling. + */ + if (!inode_init_always(mp->m_super, VFS_I(ip))) + return NULL; + + /* prevent anyone from using this yet */ + VFS_I(ip)->i_state = I_NEW|I_LOCK; return ip; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e7ae08d1df48..123b20c8cbf2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1258,8 +1258,10 @@ xfs_file_last_byte( * necessary. */ if (ip->i_df.if_flags & XFS_IFEXTENTS) { + xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) { last_block = 0; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 08ce72316bfe..5aaa2d7ec155 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -338,38 +338,6 @@ xfs_iomap_eof_align_last_fsb( } STATIC int -xfs_flush_space( - xfs_inode_t *ip, - int *fsynced, - int *ioflags) -{ - switch (*fsynced) { - case 0: - if (ip->i_delayed_blks) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inode(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 1; - } else { - *ioflags |= BMAPI_SYNC; - *fsynced = 2; - } - return 0; - case 1: - *fsynced = 2; - *ioflags |= BMAPI_SYNC; - return 0; - case 2: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_device(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 3; - return 0; - } - return 1; -} - -STATIC int xfs_cmn_err_fsblock_zero( xfs_inode_t *ip, xfs_bmbt_irec_t *imap) @@ -538,15 +506,9 @@ error_out: } /* - * If the caller is doing a write at the end of the file, - * then extend the allocation out to the file system's write - * iosize. We clean up any extra space left over when the - * file is closed in xfs_inactive(). - * - * For sync writes, we are flushing delayed allocate space to - * try to make additional space available for allocation near - * the filesystem full boundary - preallocation hurts in that - * situation, of course. + * If the caller is doing a write at the end of the file, then extend the + * allocation out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). 
*/ STATIC int xfs_iomap_eof_want_preallocate( @@ -565,7 +527,7 @@ xfs_iomap_eof_want_preallocate( int n, error, imaps; *prealloc = 0; - if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size) + if ((offset + count) <= ip->i_size) return 0; /* @@ -611,7 +573,7 @@ xfs_iomap_write_delay( xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, fsynced = 0; + int prealloc, flushed = 0; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -627,12 +589,12 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); -retry: error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, ioflag, imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; +retry: if (prealloc) { aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); @@ -659,15 +621,22 @@ retry: /* * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space - flush delalloc, and retry.. + * then we must have run out of space - flush all other inodes with + * delalloc blocks and retry without EOF preallocation. */ if (nimaps == 0) { xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, ip, offset, count); - if (xfs_flush_space(ip, &fsynced, &ioflag)) + if (flushed) return XFS_ERROR(ENOSPC); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + + flushed = 1; error = 0; + prealloc = 0; goto retry; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index a1cc1322fc0f..fdcf7b82747f 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -40,8 +40,7 @@ typedef enum { BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */ BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */ BMAPI_MMAP = (1 << 6), /* allocate for mmap write */ - BMAPI_SYNC = (1 << 7), /* sync write to flush delalloc space */ - BMAPI_TRYLOCK = (1 << 8), /* non-blocking request */ + BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */ } bmapi_flags_t; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f76c6d7cea21..3750f04ede0b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -562,9 +562,8 @@ xfs_log_mount( } mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); - if (!mp->m_log) { - cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!"); - error = ENOMEM; + if (IS_ERR(mp->m_log)) { + error = -PTR_ERR(mp->m_log); goto out; } @@ -1180,10 +1179,13 @@ xlog_alloc_log(xfs_mount_t *mp, xfs_buf_t *bp; int i; int iclogsize; + int error = ENOMEM; log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); - if (!log) - return NULL; + if (!log) { + xlog_warn("XFS: Log allocation failed: No memory!"); + goto out; + } log->l_mp = mp; log->l_targ = log_target; @@ -1201,19 +1203,35 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_grant_reserve_cycle = 1; log->l_grant_write_cycle = 1; + error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; - ASSERT(log->l_sectbb_log <= mp->m_sectbb_log); + if (log->l_sectbb_log < 0 || + log->l_sectbb_log > mp->m_sectbb_log) { + xlog_warn("XFS: Log sector size (0x%x) out of range.", + log->l_sectbb_log); + goto out_free_log; + } + /* for larger sector sizes, must have v2 or external log */ - ASSERT(log->l_sectbb_log == 0 || - log->l_logBBstart == 0 || - xfs_sb_version_haslogv2(&mp->m_sb)); - ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT); + if (log->l_sectbb_log != 0 && + (log->l_logBBstart != 0 && + !xfs_sb_version_haslogv2(&mp->m_sb))) { + xlog_warn("XFS: log 
sector size (0x%x) invalid " + "for configuration.", log->l_sectbb_log); + goto out_free_log; + } + if (mp->m_sb.sb_logsectlog < BBSHIFT) { + xlog_warn("XFS: Log sector log (0x%x) too small.", + mp->m_sb.sb_logsectlog); + goto out_free_log; + } } log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; xlog_get_iclog_buffer_size(mp, log); + error = ENOMEM; bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); if (!bp) goto out_free_log; @@ -1313,7 +1331,8 @@ out_free_iclog: xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); - return NULL; +out: + return ERR_PTR(-error); } /* xlog_alloc_log */ @@ -2541,18 +2560,19 @@ redo: xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_grant_log_space: wake 2"); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); @@ -2631,7 +2651,7 @@ xlog_regrant_write_log_space(xlog_t *log, * for more free space, otherwise try to get some space for * this transaction. */ - + need_bytes = tic->t_unit_res; if ((ntic = log->l_write_headq)) { free_bytes = xlog_space_left(log, log->l_grant_write_cycle, log->l_grant_write_bytes); @@ -2651,26 +2671,25 @@ xlog_regrant_write_log_space(xlog_t *log, xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already * off the queue */ - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 1"); - xlog_grant_push_ail(log->l_mp, tic->t_unit_res); - spin_lock(&log->l_grant_lock); } } - need_bytes = tic->t_unit_res; - redo: if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; @@ -2680,19 +2699,20 @@ redo: if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already off the queue */ - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 2"); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_write_headq, tic); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b101990df027..65a99725d0cc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -291,14 +291,17 @@ xfs_mount_validate_sb( sbp->sb_sectsize > XFS_MAX_SECTORSIZE || sbp->sb_sectlog < 
XFS_MIN_SECTORSIZE_LOG || sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7af44adffc8f..d6a64392f983 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -313,7 +313,7 @@ typedef struct xfs_mount { #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct task_struct *m_sync_task; /* generalised sync thread */ - bhv_vfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ + xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ struct list_head m_sync_list; /* sync thread work item list */ spinlock_t m_sync_lock; /* work item list lock */ int m_sync_seq; /* sync thread generation no. */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 7394c7af5de5..19cf90a9c762 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1457,6 +1457,13 @@ xfs_create( error = xfs_trans_reserve(tp, resblks, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(dp); + error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + } + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ resblks = 0; error = xfs_trans_reserve(tp, 0, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count);
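
Note on the xfs_inode_alloc() hunk above: the new comment documents an ordering rule — the VFS half of the inode is initialised last, because a failure in inode_init_always() is cleaned up through ->destroy_inode and the reclaim path, which frees the whole xfs_inode, so the error path must not touch the object again. A minimal userspace sketch of the same two-stage pattern follows; struct obj, init_stage2() and destroy() are hypothetical stand-ins, not XFS code.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	int core_ready;		/* stage-1 (internal) state */
};

static void destroy(struct obj *o)
{
	/* the destructor relies on stage-1 state, then frees o */
	o->core_ready = 0;
	free(o);
}

/*
 * Stage 2 may fail; on failure it runs the destructor itself, the
 * way a failed inode_init_always() ends up in ->destroy_inode.
 */
static bool init_stage2(struct obj *o, bool fail)
{
	if (fail) {
		destroy(o);
		return false;
	}
	return true;
}

static struct obj *obj_alloc(bool fail)
{
	struct obj *o = calloc(1, sizeof(*o));
	if (!o)
		return NULL;
	o->core_ready = 1;	/* internal init first */

	/*
	 * VFS-style init last: if it fails, o has already been freed
	 * and must not be referenced again — just return NULL.
	 */
	if (!init_stage2(o, fail))
		return NULL;
	return o;
}

int main(void)
{
	struct obj *o = obj_alloc(false);
	printf("normal alloc: %s\n", o ? "ok" : "failed");
	if (o)
		destroy(o);
	printf("stage-2 failure: %s\n", obj_alloc(true) ? "ok" : "failed");
	return 0;
}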
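The recurring shape in the xfs_iomap_write_delay() and xfs_create() hunks is the same: attempt an allocation or reservation, and on ENOSPC flush outstanding delayed-allocation blocks exactly once, then retry with EOF preallocation disabled before giving up (xfs_create() additionally falls back to a "no-allocation" reservation). A minimal sketch of that control flow, with hypothetical try_reserve() and flush_delalloc() helpers standing in for xfs_trans_reserve() and xfs_flush_inodes():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* simulated free-space pool; flushing delalloc returns some of it */
static unsigned free_blocks = 4;

/* hypothetical stand-in for xfs_trans_reserve(): 0 or ENOSPC */
static int try_reserve(unsigned blocks)
{
	if (blocks > free_blocks)
		return ENOSPC;
	free_blocks -= blocks;
	return 0;
}

/*
 * Hypothetical stand-in for xfs_flush_inodes(): converting delalloc
 * extents to real ones can release over-reserved space.
 */
static void flush_delalloc(void)
{
	free_blocks += 8;
}

static int reserve_with_retry(unsigned blocks, bool prealloc)
{
	bool flushed = false;
	int error;

retry:
	error = try_reserve(blocks + (prealloc ? 16 : 0));
	if (error == ENOSPC && !flushed) {
		/* flush outstanding delalloc blocks and retry once */
		flush_delalloc();
		flushed = true;
		prealloc = false;	/* preallocation hurts near full */
		goto retry;
	}
	if (error == ENOSPC)
		error = try_reserve(0);	/* "no-allocation" reservation */
	return error;
}

int main(void)
{
	printf("reserve 8 blocks: %s\n",
	       reserve_with_retry(8, true) ? "ENOSPC" : "ok");
	return 0;
}

This also shows why the patches delete the staged xfs_flush_space() state machine: a single flush-everything-and-retry pass replaces the three-step fsynced escalation.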
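The xfs_mount_validate_sb() hunk tightens geometry checking: each superblock size field must now equal exactly 1 << its log2 counterpart, not merely fall within the permitted range, so a size field corrupted independently of its log field is rejected at mount time. A small sketch of that invariant (the helper name and parameters are illustrative, not the on-disk structure):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A size/log pair is consistent only when the log lies in range and
 * the size is exactly 1 << log — the shape of the checks added for
 * sb_sectsize/sb_sectlog, sb_blocksize/sb_blocklog and
 * sb_inodesize/sb_inodelog.
 */
static bool sb_size_log_ok(uint32_t size, uint32_t log,
			   uint32_t min_log, uint32_t max_log)
{
	return log >= min_log && log <= max_log && size == (1u << log);
}

int main(void)
{
	/* 512-byte sectors with log 9: consistent */
	printf("%d\n", sb_size_log_ok(512, 9, 9, 15));
	/* corrupt: size says 512 but log says 10 */
	printf("%d\n", sb_size_log_ok(512, 10, 9, 15));
	return 0;
}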