Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

author: Dmitry Torokhov <dtor_core@ameritech.net> 2006-06-26 01:31:38 -0400
committer: Dmitry Torokhov <dtor_core@ameritech.net> 2006-06-26 01:31:38 -0400
commit: 4854c7b27f0975a2b629f35ea3996d2968eb7c4f (patch)
tree: 4102bdb70289764a2058aff0f907b13d7cf0e0d1 /fs
parent: 3cbd5b32cb625f5c0f1b1476d154fac873dd49ce (diff)
parent: fcc18e83e1f6fd9fa6b333735bf0fcd530655511 (diff)
download: linux-4854c7b27f0975a2b629f35ea3996d2968eb7c4f.tar.gz
linux-4854c7b27f0975a2b629f35ea3996d2968eb7c4f.tar.bz2
linux-4854c7b27f0975a2b629f35ea3996d2968eb7c4f.zip
327 files changed, 14698 insertions, 12917 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2cb87ba4b1c1..5c6bdf82146c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -530,9 +530,6 @@ error:
 	if (vfid)
 		v9fs_fid_destroy(vfid);
 
-	if (inode)
-		iput(inode);
-
 	return err;
 }
 
@@ -1054,6 +1051,9 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
 	int ret;
 	char *link = __getname();
 
+	if (unlikely(!link))
+		return -ENOMEM;
+
 	if (buflen > PATH_MAX)
 		buflen = PATH_MAX;
 
@@ -1171,9 +1171,6 @@ error:
 	if (vfid)
 		v9fs_fid_destroy(vfid);
 
-	if (inode)
-		iput(inode);
-
 	return err;
 
 }
@@ -1227,6 +1224,9 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	name = __getname();
+	if (unlikely(!name))
+		return -ENOMEM;
+
 	sprintf(name, "%d\n", oldfid->fid);
 	retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name);
 	__putname(name);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 61c599b4a1e3..8b15bb22caca 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -99,12 +99,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
  * @flags: mount flags
  * @dev_name: device name that was mounted
  * @data: mount options
+ * @mnt: mountpoint record to be instantiated
  *
  */
 
-static struct super_block *v9fs_get_sb(struct file_system_type
-				       *fs_type, int flags,
-				       const char *dev_name, void *data)
+static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data,
+		       struct vfsmount *mnt)
 {
 	struct super_block *sb = NULL;
 	struct v9fs_fcall *fcall = NULL;
@@ -123,17 +124,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
 		dprintk(DEBUG_ERROR, "problem initiating session\n");
-		sb = ERR_PTR(newfid);
+		retval = newfid;
 		goto out_free_session;
 	}
 
 	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
-	if (IS_ERR(sb))
+	if (IS_ERR(sb)) {
+		retval = PTR_ERR(sb);
 		goto out_close_session;
+	}
 	v9fs_fill_super(sb, v9ses, flags);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
@@ -184,19 +187,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		goto put_back_sb;
 	}
 
-	return sb;
+	return simple_set_mnt(mnt, sb);
 
 out_close_session:
 	v9fs_session_close(v9ses);
 out_free_session:
 	kfree(v9ses);
-	return sb;
+	return retval;
 
 put_back_sb:
 	/* deactivate_super calls v9fs_kill_super which will frees the rest */
 	up_write(&sb->s_umount);
 	deactivate_super(sb);
-	return ERR_PTR(retval);
+	return retval;
 }
 
 /**
@@ -253,11 +256,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static void
-v9fs_umount_begin(struct super_block *sb)
+v9fs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info;
 
-	v9fs_session_cancel(v9ses);
+	if (flags & MNT_FORCE)
+		v9fs_session_cancel(v9ses);
 }
 
 static struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index f9b5842c8d2d..1cdc043922d5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -53,7 +53,7 @@ config EXT2_FS_SECURITY
 
 config EXT2_FS_XIP
 	bool "Ext2 execute in place support"
-	depends on EXT2_FS
+	depends on EXT2_FS && MMU
 	help
 	  Execute in place can be used on memory-backed block devices. If you
 	  enable this option, you can select to mount block devices which are
@@ -393,18 +393,30 @@ config INOTIFY
 	bool "Inotify file change notification support"
 	default y
 	---help---
-	  Say Y here to enable inotify support and the associated system
-	  calls.  Inotify is a file change notification system and a
-	  replacement for dnotify.  Inotify fixes numerous shortcomings in
-	  dnotify and introduces several new features.  It allows monitoring
-	  of both files and directories via a single open fd.  Other features
-	  include multiple file events, one-shot support, and unmount
+	  Say Y here to enable inotify support.  Inotify is a file change
+	  notification system and a replacement for dnotify.  Inotify fixes
+	  numerous shortcomings in dnotify and introduces several new features
+	  including multiple file events, one-shot support, and unmount
 	  notification.
 
 	  For more information, see Documentation/filesystems/inotify.txt
 
 	  If unsure, say Y.
 
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	depends on INOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+
+	  For more information, see Documentation/filesystems/inotify.txt
+
+	  If unsure, say Y.
+
 config QUOTA
 	bool "Quota support"
 	help
@@ -764,7 +776,8 @@ endmenu
 menu "Pseudo filesystems"
 
 config PROC_FS
-	bool "/proc file system support"
+	bool "/proc file system support" if EMBEDDED
+	default y
 	help
 	  This is a virtual file system providing information about the status
 	  of the system. "Virtual" means that it doesn't take up any space on
@@ -1101,6 +1114,44 @@ config JFFS2_SUMMARY
 
 	  If unsure, say 'N'.
 
+config JFFS2_FS_XATTR
+	bool "JFFS2 XATTR support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL && !JFFS2_FS_WRITEBUFFER
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+	  
+	  If unsure, say N.
+
+config JFFS2_FS_POSIX_ACL
+	bool "JFFS2 POSIX Access Control Lists"
+	depends on JFFS2_FS_XATTR
+	default y
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+	  
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+	  
+	  If you don't know what Access Control Lists are, say N.
+
+config JFFS2_FS_SECURITY
+	bool "JFFS2 Security Labels"
+	depends on JFFS2_FS_XATTR
+	default y
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jffs2 filesystem.
+	  
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
 config JFFS2_COMPRESSION_OPTIONS
 	bool "Advanced compression options for JFFS2"
 	depends on JFFS2_FS
@@ -1320,11 +1371,19 @@ config UFS_FS
 
 config UFS_FS_WRITE
 	bool "UFS file system write support (DANGEROUS)"
-	depends on UFS_FS && EXPERIMENTAL && BROKEN
+	depends on UFS_FS && EXPERIMENTAL
 	help
 	  Say Y here if you want to try writing to UFS partitions. This is
 	  experimental, so you should back up your UFS partitions beforehand.
 
+config UFS_DEBUG
+	bool "UFS debugging"
+	depends on UFS_FS
+	help
+	  If you are experiencing any problems with the UFS filesystem, say
+	  Y here.  This will result in _many_ additional debugging messages to be
+	  written to the system log.
+
 endmenu
 
 menu "Network File Systems"
diff --git a/fs/Makefile b/fs/Makefile
index 078d3d1191a5..d0ea6bfccf29 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,6 +13,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		ioprio.o pnode.o drop_caches.o splice.o sync.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 252abda0d200..ba1c88af49fe 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -196,17 +196,17 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 	return parse_options(sb, data);
 }
 
-static int adfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(sb);
+	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
 	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = sb->s_blocksize;
+	buf->f_bsize   = dentry->d_sb->s_blocksize;
 	buf->f_blocks  = asb->s_size;
 	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(sb);
+	buf->f_bfree   = adfs_map_free(dentry->d_sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
 
 	return 0;
@@ -470,10 +470,11 @@ error:
 	return -EINVAL;
 }
 
-static struct super_block *adfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int adfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type adfs_fs_type = {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4d7e5b19e5cd..5200f4938df0 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,7 +18,7 @@
 
 extern struct timezone sys_tz;
 
-static int affs_statfs(struct super_block *sb, struct kstatfs *buf);
+static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 
 static void
@@ -271,6 +271,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	int			 reserved;
 	unsigned long		 mount_flags;
 	int			 tmp_flags;	/* fix remount prototype... */
+	u8			 sig[4];
 
 	pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
 
@@ -370,8 +371,9 @@ got_root:
 		printk(KERN_ERR "AFFS: Cannot read boot block\n");
 		goto out_error;
 	}
-	chksum = be32_to_cpu(*(__be32 *)boot_bh->b_data);
+	memcpy(sig, boot_bh->b_data, 4);
 	brelse(boot_bh);
+	chksum = be32_to_cpu(*(__be32 *)sig);
 
 	/* Dircache filesystems are compatible with non-dircache ones
 	 * when reading. As long as they aren't supported, writing is
@@ -420,11 +422,11 @@ got_root:
 	}
 
 	if (mount_flags & SF_VERBOSE) {
-		chksum = cpu_to_be32(chksum);
-		printk(KERN_NOTICE "AFFS: Mounting volume \"%*s\": Type=%.3s\\%c, Blocksize=%d\n",
-			AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0],
+		u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
+		printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
+			len > 31 ? 31 : len,
 			AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
-			(char *)&chksum,((char *)&chksum)[3] + '0',blocksize);
+			sig, sig[3] + '0', blocksize);
 	}
 
 	sb->s_flags |= MS_NODEV | MS_NOSUID;
@@ -508,8 +510,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static int
-affs_statfs(struct super_block *sb, struct kstatfs *buf)
+affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	int		 free;
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
@@ -524,10 +527,11 @@ affs_statfs(struct super_block *sb, struct kstatfs *buf)
 	return 0;
 }
 
-static struct super_block *affs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int affs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type affs_fs_type = {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index a6dff6a4f204..2fc99877cb0d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -185,9 +185,7 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
 
 	_enter("{%lu},%lu", dir->i_ino, index);
 
-	page = read_cache_page(dir->i_mapping,index,
-			       (filler_t *) dir->i_mapping->a_ops->readpage,
-			       NULL);
+	page = read_mapping_page(dir->i_mapping, index, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 4e6eeb59b83c..99785a79d043 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -63,7 +63,6 @@ unsigned long afs_mntpt_expiry_timeout = 20;
 int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 {
 	struct page *page;
-	filler_t *filler;
 	size_t size;
 	char *buf;
 	int ret;
@@ -71,10 +70,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 	_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
-	filler = (filler_t *) AFS_VNODE_TO_I(vnode)->i_mapping->a_ops->readpage;
-
-	page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
-			       filler, NULL);
+	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto out;
@@ -160,7 +156,6 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	struct page *page = NULL;
 	size_t size;
 	char *buf, *devname = NULL, *options = NULL;
-	filler_t *filler;
 	int ret;
 
 	kenter("{%s}", mntpt->d_name.name);
@@ -182,9 +177,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 		goto error;
 
 	/* read the contents of the AFS special symlink */
-	filler = (filler_t *)mntpt->d_inode->i_mapping->a_ops->readpage;
-
-	page = read_cache_page(mntpt->d_inode->i_mapping, 0, filler, NULL);
+	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto error;
@@ -210,7 +203,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
 	/* try and do the mount */
 	kdebug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = do_kern_mount("afs", 0, devname, options);
+	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
 	kdebug("--- mount result %p ---", mnt);
 
 	free_page((unsigned long) devname);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 53c56e7231ab..67d1f5c819ec 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -38,9 +38,9 @@ struct afs_mount_params {
 static void afs_i_init_once(void *foo, kmem_cache_t *cachep,
 			    unsigned long flags);
 
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name,
-				      void *data);
+static int afs_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name,
+		      void *data, struct vfsmount *mnt);
 
 static struct inode *afs_alloc_inode(struct super_block *sb);
 
@@ -48,7 +48,7 @@ static void afs_put_super(struct super_block *sb);
 
 static void afs_destroy_inode(struct inode *inode);
 
-static struct file_system_type afs_fs_type = {
+struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
 	.get_sb		= afs_get_sb,
@@ -294,10 +294,11 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
  * get an AFS superblock
  * - TODO: don't use get_sb_nodev(), but rather call sget() directly
  */
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
-				      int flags,
-				      const char *dev_name,
-				      void *options)
+static int afs_get_sb(struct file_system_type *fs_type,
+		      int flags,
+		      const char *dev_name,
+		      void *options,
+		      struct vfsmount *mnt)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -311,7 +312,7 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 	ret = afscm_start();
 	if (ret < 0) {
 		_leave(" = %d", ret);
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	/* parse the options */
@@ -348,18 +349,19 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 		goto error;
 	}
 	sb->s_flags |= MS_ACTIVE;
+	simple_set_mnt(mnt, sb);
 
 	afs_put_volume(params.volume);
 	afs_put_cell(params.default_cell);
-	_leave(" = %p", sb);
-	return sb;
+	_leave(" = 0 [%p]", sb);
+	return 0;
 
  error:
 	afs_put_volume(params.volume);
 	afs_put_cell(params.default_cell);
 	afscm_stop();
 	_leave(" = %d", ret);
-	return ERR_PTR(ret);
+	return ret;
 } /* end afs_get_sb() */
 
 /*****************************************************************************/
diff --git a/fs/afs/super.h b/fs/afs/super.h
index ac11362f4e95..32de8cc6fae8 100644
--- a/fs/afs/super.h
+++ b/fs/afs/super.h
@@ -38,6 +38,8 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+extern struct file_system_type afs_fs_type;
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/aio.c b/fs/aio.c
index e41e932ba489..8c34a62df7d7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -777,11 +777,11 @@ out:
 static int __aio_run_iocbs(struct kioctx *ctx)
 {
 	struct kiocb *iocb;
-	LIST_HEAD(run_list);
+	struct list_head run_list;
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	list_splice_init(&ctx->run_list, &run_list);
+	list_replace_init(&ctx->run_list, &run_list);
 	while (!list_empty(&run_list)) {
 		iocb = list_entry(run_list.next, struct kiocb,
 			ki_run_list);
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index b977ece69f0c..aca123752406 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
 }
 
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b8ce02607d66..4456d1daa40f 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -174,6 +174,12 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 			struct autofs_info *ino = autofs4_dentry_ino(p);
 			unsigned int ino_count = atomic_read(&ino->count);
 
+			/*
+			 * Clean stale dentries below that have not been
+			 * invalidated after a mount fail during lookup
+			 */
+			d_invalidate(p);
+
 			/* allow for dget above and top is already dgot */
 			if (p == top)
 				ino_count += 2;
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index acecec8578ce..5d9193332bef 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super);
+	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
 }
 
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 68ebd10f345d..08201fab26cd 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -49,7 +49,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static void befs_put_super(struct super_block *);
 static int befs_remount(struct super_block *, int *, char *);
-static int befs_statfs(struct super_block *, struct kstatfs *);
+static int befs_statfs(struct dentry *, struct kstatfs *);
 static int parse_options(char *, befs_mount_options *);
 
 static const struct super_operations befs_sops = {
@@ -880,8 +880,9 @@ befs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static int
-befs_statfs(struct super_block *sb, struct kstatfs *buf)
+befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 
 	befs_debug(sb, "---> befs_statfs()");
 
@@ -899,11 +900,12 @@ befs_statfs(struct super_block *sb, struct kstatfs *buf)
 	return 0;
 }
 
-static struct super_block *
+static int
 befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
-	    void *data)
+	    void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type befs_fs_type = {
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 55a7a78332f8..cf74f3d4d966 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -203,8 +203,9 @@ static void bfs_put_super(struct super_block *s)
 	s->s_fs_info = NULL;
 }
 
-static int bfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *s = dentry->d_sb;
 	struct bfs_sb_info *info = BFS_SB(s);
 	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	buf->f_type = BFS_MAGIC;
@@ -410,10 +411,10 @@ out:
 	return -EINVAL;
 }
 
-static struct super_block *bfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
 }
 
 static struct file_system_type bfs_fs_type = {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 537893a16014..d0434406eaeb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -38,15 +38,13 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/random.h>
-
+#include <linux/elf.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
 
-#include <linux/elf.h>
-
-static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs);
-static int load_elf_library(struct file*);
+static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+static int load_elf_library(struct file *);
 static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
 extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
 
@@ -59,15 +57,15 @@ extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
  * don't even try.
  */
 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file);
+static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file);
 #else
 #define elf_core_dump	NULL
 #endif
 
 #if ELF_EXEC_PAGESIZE > PAGE_SIZE
-# define ELF_MIN_ALIGN	ELF_EXEC_PAGESIZE
+#define ELF_MIN_ALIGN	ELF_EXEC_PAGESIZE
 #else
-# define ELF_MIN_ALIGN	PAGE_SIZE
+#define ELF_MIN_ALIGN	PAGE_SIZE
 #endif
 
 #ifndef ELF_CORE_EFLAGS
@@ -86,7 +84,7 @@ static struct linux_binfmt elf_format = {
 		.min_coredump	= ELF_EXEC_PAGESIZE
 };
 
-#define BAD_ADDR(x)	((unsigned long)(x) > TASK_SIZE)
+#define BAD_ADDR(x) ((unsigned long)(x) > TASK_SIZE)
 
 static int set_brk(unsigned long start, unsigned long end)
 {
@@ -104,13 +102,11 @@ static int set_brk(unsigned long start, unsigned long end)
 	return 0;
 }
 
-
 /* We need to explicitly zero any fractional pages
    after the data section (i.e. bss).  This would
    contain the junk from the file that should not
-   be in memory */
-
-
+   be in memory
+ */
 static int padzero(unsigned long elf_bss)
 {
 	unsigned long nbyte;
@@ -129,7 +125,9 @@ static int padzero(unsigned long elf_bss)
 #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items))
 #define STACK_ROUND(sp, items) \
 	((15 + (unsigned long) ((sp) + (items))) &~ 15UL)
-#define STACK_ALLOC(sp, len) ({ elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; old_sp; })
+#define STACK_ALLOC(sp, len) ({ \
+	elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \
+	old_sp; })
 #else
 #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items))
 #define STACK_ROUND(sp, items) \
@@ -138,7 +136,7 @@ static int padzero(unsigned long elf_bss)
 #endif
 
 static int
-create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
+create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 		int interp_aout, unsigned long load_addr,
 		unsigned long interp_load_addr)
 {
@@ -161,7 +159,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 	 * for userspace to get any other way, in others (i386) it is
 	 * merely difficult.
 	 */
-
 	u_platform = NULL;
 	if (k_platform) {
 		size_t len = strlen(k_platform) + 1;
@@ -171,7 +168,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 		 * evictions by the processes running on the same package. One
 		 * thing we can do is to shuffle the initial stack for them.
 		 */
-	 
+
 		p = arch_align_stack(p);
 
 		u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
@@ -180,9 +177,12 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 	}
 
 	/* Create the ELF interpreter info */
-	elf_info = (elf_addr_t *) current->mm->saved_auxv;
+	elf_info = (elf_addr_t *)current->mm->saved_auxv;
 #define NEW_AUX_ENT(id, val) \
-	do { elf_info[ei_index++] = id; elf_info[ei_index++] = val; } while (0)
+	do { \
+		elf_info[ei_index++] = id; \
+		elf_info[ei_index++] = val; \
+	} while (0)
 
 #ifdef ARCH_DLINFO
 	/* 
@@ -195,21 +195,22 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 	NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
 	NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
 	NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
-	NEW_AUX_ENT(AT_PHENT, sizeof (struct elf_phdr));
+	NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
 	NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
 	NEW_AUX_ENT(AT_BASE, interp_load_addr);
 	NEW_AUX_ENT(AT_FLAGS, 0);
 	NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
-	NEW_AUX_ENT(AT_UID, (elf_addr_t) tsk->uid);
-	NEW_AUX_ENT(AT_EUID, (elf_addr_t) tsk->euid);
-	NEW_AUX_ENT(AT_GID, (elf_addr_t) tsk->gid);
-	NEW_AUX_ENT(AT_EGID, (elf_addr_t) tsk->egid);
- 	NEW_AUX_ENT(AT_SECURE, (elf_addr_t) security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_UID, tsk->uid);
+	NEW_AUX_ENT(AT_EUID, tsk->euid);
+	NEW_AUX_ENT(AT_GID, tsk->gid);
+	NEW_AUX_ENT(AT_EGID, tsk->egid);
+ 	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
 	if (k_platform) {
-		NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform);
+		NEW_AUX_ENT(AT_PLATFORM,
+			    (elf_addr_t)(unsigned long)u_platform);
 	}
 	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
-		NEW_AUX_ENT(AT_EXECFD, (elf_addr_t) bprm->interp_data);
+		NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
 	}
 #undef NEW_AUX_ENT
 	/* AT_NULL is zero; clear the rest too */
@@ -232,7 +233,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 	/* Point sp at the lowest address on the stack */
 #ifdef CONFIG_STACK_GROWSUP
 	sp = (elf_addr_t __user *)bprm->p - items - ei_index;
-	bprm->exec = (unsigned long) sp; /* XXX: PARISC HACK */
+	bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */
 #else
 	sp = (elf_addr_t __user *)bprm->p;
 #endif
@@ -285,7 +286,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 #ifndef elf_map
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-			struct elf_phdr *eppnt, int prot, int type)
+		struct elf_phdr *eppnt, int prot, int type)
 {
 	unsigned long map_addr;
 	unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
@@ -310,9 +311,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
    is only provided so that we can read a.out libraries that have
    an ELF header */
 
-static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
-				     struct file * interpreter,
-				     unsigned long *interp_load_addr)
+static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
+		struct file *interpreter, unsigned long *interp_load_addr)
 {
 	struct elf_phdr *elf_phdata;
 	struct elf_phdr *eppnt;
@@ -342,15 +342,15 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
 		goto out;
 
 	/* Now read in all of the header information */
-
 	size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum;
 	if (size > ELF_MIN_ALIGN)
 		goto out;
-	elf_phdata = (struct elf_phdr *) kmalloc(size, GFP_KERNEL);
+	elf_phdata = kmalloc(size, GFP_KERNEL);
 	if (!elf_phdata)
 		goto out;
 
-	retval = kernel_read(interpreter,interp_elf_ex->e_phoff,(char *)elf_phdata,size);
+	retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
+			     (char *)elf_phdata,size);
 	error = -EIO;
 	if (retval != size) {
 		if (retval < 0)
@@ -359,58 +359,65 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
 	}
 
 	eppnt = elf_phdata;
-	for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) {
-	  if (eppnt->p_type == PT_LOAD) {
-	    int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
-	    int elf_prot = 0;
-	    unsigned long vaddr = 0;
-	    unsigned long k, map_addr;
-
-	    if (eppnt->p_flags & PF_R) elf_prot =  PROT_READ;
-	    if (eppnt->p_flags & PF_W) elf_prot |= PROT_WRITE;
-	    if (eppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
-	    vaddr = eppnt->p_vaddr;
-	    if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
-	    	elf_type |= MAP_FIXED;
-
-	    map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type);
-	    error = map_addr;
-	    if (BAD_ADDR(map_addr))
-	    	goto out_close;
-
-	    if (!load_addr_set && interp_elf_ex->e_type == ET_DYN) {
-		load_addr = map_addr - ELF_PAGESTART(vaddr);
-		load_addr_set = 1;
-	    }
-
-	    /*
-	     * Check to see if the section's size will overflow the
-	     * allowed task size. Note that p_filesz must always be
-	     * <= p_memsize so it is only necessary to check p_memsz.
-	     */
-	    k = load_addr + eppnt->p_vaddr;
-	    if (k > TASK_SIZE || eppnt->p_filesz > eppnt->p_memsz ||
-		eppnt->p_memsz > TASK_SIZE || TASK_SIZE - eppnt->p_memsz < k) {
-	        error = -ENOMEM;
-		goto out_close;
-	    }
-
-	    /*
-	     * Find the end of the file mapping for this phdr, and keep
-	     * track of the largest address we see for this.
-	     */
-	    k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
-	    if (k > elf_bss)
-		elf_bss = k;
-
-	    /*
-	     * Do the same thing for the memory mapping - between
-	     * elf_bss and last_bss is the bss section.
-	     */
-	    k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
-	    if (k > last_bss)
-		last_bss = k;
-	  }
+	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+		if (eppnt->p_type == PT_LOAD) {
+			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+			int elf_prot = 0;
+			unsigned long vaddr = 0;
+			unsigned long k, map_addr;
+
+			if (eppnt->p_flags & PF_R)
+		    		elf_prot = PROT_READ;
+			if (eppnt->p_flags & PF_W)
+				elf_prot |= PROT_WRITE;
+			if (eppnt->p_flags & PF_X)
+				elf_prot |= PROT_EXEC;
+			vaddr = eppnt->p_vaddr;
+			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
+				elf_type |= MAP_FIXED;
+
+			map_addr = elf_map(interpreter, load_addr + vaddr,
+					   eppnt, elf_prot, elf_type);
+			error = map_addr;
+			if (BAD_ADDR(map_addr))
+				goto out_close;
+
+			if (!load_addr_set &&
+			    interp_elf_ex->e_type == ET_DYN) {
+				load_addr = map_addr - ELF_PAGESTART(vaddr);
+				load_addr_set = 1;
+			}
+
+			/*
+			 * Check to see if the section's size will overflow the
+			 * allowed task size. Note that p_filesz must always be
+			 * <= p_memsz so it's only necessary to check p_memsz.
+			 */
+			k = load_addr + eppnt->p_vaddr;
+			if (k > TASK_SIZE ||
+			    eppnt->p_filesz > eppnt->p_memsz ||
+			    eppnt->p_memsz > TASK_SIZE ||
+			    TASK_SIZE - eppnt->p_memsz < k) {
+				error = -ENOMEM;
+				goto out_close;
+			}
+
+			/*
+			 * Find the end of the file mapping for this phdr, and
+			 * keep track of the largest address we see for this.
+			 */
+			k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
+			if (k > elf_bss)
+				elf_bss = k;
+
+			/*
+			 * Do the same thing for the memory mapping - between
+			 * elf_bss and last_bss is the bss section.
+			 */
+			k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+			if (k > last_bss)
+				last_bss = k;
+		}
 	}
 
 	/*
@@ -424,7 +431,8 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
 		goto out_close;
 	}
 
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);	/* What we have mapped so far */
+	/* What we have mapped so far */
+	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
 	/* Map the last of the bss segment */
 	if (last_bss > elf_bss) {
@@ -436,7 +444,7 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
 	}
 
 	*interp_load_addr = load_addr;
-	error = ((unsigned long) interp_elf_ex->e_entry) + load_addr;
+	error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
 
 out_close:
 	kfree(elf_phdata);
@@ -444,8 +452,8 @@ out:
 	return error;
 }
 
-static unsigned long load_aout_interp(struct exec * interp_ex,
-			     struct file * interpreter)
+static unsigned long load_aout_interp(struct exec *interp_ex,
+		struct file *interpreter)
 {
 	unsigned long text_data, elf_entry = ~0UL;
 	char __user * addr;
@@ -464,7 +472,7 @@ static unsigned long load_aout_interp(struct exec * interp_ex,
 	case ZMAGIC:
 	case QMAGIC:
 		offset = N_TXTOFF(*interp_ex);
-		addr = (char __user *) N_TXTADDR(*interp_ex);
+		addr = (char __user *)N_TXTADDR(*interp_ex);
 		break;
 	default:
 		goto out;
@@ -480,7 +488,6 @@ static unsigned long load_aout_interp(struct exec * interp_ex,
 	flush_icache_range((unsigned long)addr,
 	                   (unsigned long)addr + text_data);
 
-
 	down_write(&current->mm->mmap_sem);	
 	do_brk(ELF_PAGESTART(text_data + ELF_MIN_ALIGN - 1),
 		interp_ex->a_bss);
@@ -519,7 +526,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
 
-static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
  	unsigned long load_addr = 0, load_bias = 0;
@@ -528,7 +535,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	unsigned int interpreter_type = INTERPRETER_NONE;
 	unsigned char ibcs2_interpreter = 0;
 	unsigned long error;
-	struct elf_phdr * elf_ppnt, *elf_phdata;
+	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
 	int elf_exec_fileno;
 	int retval, i;
@@ -553,7 +560,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	}
 	
 	/* Get the exec-header */
-	loc->elf_ex = *((struct elfhdr *) bprm->buf);
+	loc->elf_ex = *((struct elfhdr *)bprm->buf);
 
 	retval = -ENOEXEC;
 	/* First of all, some simple consistency checks */
@@ -568,7 +575,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out;
 
 	/* Now read in all of the header information */
-
 	if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
 		goto out;
 	if (loc->elf_ex.e_phnum < 1 ||
@@ -576,18 +582,19 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out;
 	size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
 	retval = -ENOMEM;
-	elf_phdata = (struct elf_phdr *) kmalloc(size, GFP_KERNEL);
+	elf_phdata = kmalloc(size, GFP_KERNEL);
 	if (!elf_phdata)
 		goto out;
 
-	retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, (char *) elf_phdata, size);
+	retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
+			     (char *)elf_phdata, size);
 	if (retval != size) {
 		if (retval >= 0)
 			retval = -EIO;
 		goto out_free_ph;
 	}
 
-	files = current->files;		/* Refcounted so ok */
+	files = current->files;	/* Refcounted so ok */
 	retval = unshare_files();
 	if (retval < 0)
 		goto out_free_ph;
@@ -598,7 +605,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* exec will make our files private anyway, but for the a.out
 	   loader stuff we need to do it earlier */
-
 	retval = get_unused_fd();
 	if (retval < 0)
 		goto out_free_fh;
@@ -620,7 +626,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			 * shared libraries - for now assume that this
 			 * is an a.out format binary
 			 */
-
 			retval = -ENOEXEC;
 			if (elf_ppnt->p_filesz > PATH_MAX || 
 			    elf_ppnt->p_filesz < 2)
@@ -628,13 +633,13 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 			retval = -ENOMEM;
 			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
-							   GFP_KERNEL);
+						  GFP_KERNEL);
 			if (!elf_interpreter)
 				goto out_free_file;
 
 			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
-					   elf_interpreter,
-					   elf_ppnt->p_filesz);
+					     elf_interpreter,
+					     elf_ppnt->p_filesz);
 			if (retval != elf_ppnt->p_filesz) {
 				if (retval >= 0)
 					retval = -EIO;
@@ -678,7 +683,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			retval = PTR_ERR(interpreter);
 			if (IS_ERR(interpreter))
 				goto out_free_interp;
-			retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE);
+			retval = kernel_read(interpreter, 0, bprm->buf,
+					     BINPRM_BUF_SIZE);
 			if (retval != BINPRM_BUF_SIZE) {
 				if (retval >= 0)
 					retval = -EIO;
@@ -686,8 +692,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			}
 
 			/* Get the exec headers */
-			loc->interp_ex = *((struct exec *) bprm->buf);
-			loc->interp_elf_ex = *((struct elfhdr *) bprm->buf);
+			loc->interp_ex = *((struct exec *)bprm->buf);
+			loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
 			break;
 		}
 		elf_ppnt++;
@@ -739,7 +745,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* OK, we are done with that, now set up the arg stuff,
 	   and then start this sucker up */
-
 	if ((!bprm->sh_bang) && (interpreter_type == INTERPRETER_AOUT)) {
 		char *passed_p = passed_fileno;
 		sprintf(passed_fileno, "%d", elf_exec_fileno);
@@ -759,7 +764,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* Discard our unneeded old files struct */
 	if (files) {
-		steal_locks(files);
 		put_files_struct(files);
 		files = NULL;
 	}
@@ -778,7 +782,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
-	if ( !(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
 		current->flags |= PF_RANDOMIZE;
 	arch_pick_mmap_layout(current->mm);
 
@@ -799,8 +803,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	   the correct location in memory.  At this point, we assume that
 	   the image should be loaded at fixed address, not at a variable
 	   address. */
-
-	for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+	for(i = 0, elf_ppnt = elf_phdata;
+	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
 		int elf_prot = 0, elf_flags;
 		unsigned long k, vaddr;
 
@@ -828,30 +832,35 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 							load_bias, nbyte)) {
 					/*
 					 * This bss-zeroing can fail if the ELF
-					 * file specifies odd protections.  So
+					 * file specifies odd protections. So
 					 * we don't check the return value
 					 */
 				}
 			}
 		}
 
-		if (elf_ppnt->p_flags & PF_R) elf_prot |= PROT_READ;
-		if (elf_ppnt->p_flags & PF_W) elf_prot |= PROT_WRITE;
-		if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
+		if (elf_ppnt->p_flags & PF_R)
+			elf_prot |= PROT_READ;
+		if (elf_ppnt->p_flags & PF_W)
+			elf_prot |= PROT_WRITE;
+		if (elf_ppnt->p_flags & PF_X)
+			elf_prot |= PROT_EXEC;
 
-		elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
+		elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
 
 		vaddr = elf_ppnt->p_vaddr;
 		if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
 			elf_flags |= MAP_FIXED;
 		} else if (loc->elf_ex.e_type == ET_DYN) {
-			/* Try and get dynamic programs out of the way of the default mmap
-			   base, as well as whatever program they might try to exec.  This
-			   is because the brk will follow the loader, and is not movable.  */
+			/* Try and get dynamic programs out of the way of the
+			 * default mmap base, as well as whatever program they
+			 * might try to exec.  This is because the brk will
+			 * follow the loader, and is not movable.  */
 			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
 		}
 
-		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags);
+		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+				elf_prot, elf_flags);
 		if (BAD_ADDR(error)) {
 			send_sig(SIGKILL, current, 0);
 			goto out_free_dentry;
@@ -868,8 +877,10 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			}
 		}
 		k = elf_ppnt->p_vaddr;
-		if (k < start_code) start_code = k;
-		if (start_data < k) start_data = k;
+		if (k < start_code)
+			start_code = k;
+		if (start_data < k)
+			start_data = k;
 
 		/*
 		 * Check to see if the section's size will overflow the
@@ -879,7 +890,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		if (k > TASK_SIZE || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
 		    elf_ppnt->p_memsz > TASK_SIZE ||
 		    TASK_SIZE - elf_ppnt->p_memsz < k) {
-			/* set_brk can never work.  Avoid overflows.  */
+			/* set_brk can never work. Avoid overflows. */
 			send_sig(SIGKILL, current, 0);
 			goto out_free_dentry;
 		}
@@ -967,8 +978,9 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	compute_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
-	create_elf_tables(bprm, &loc->elf_ex, (interpreter_type == INTERPRETER_AOUT),
-			load_addr, interp_load_addr);
+	create_elf_tables(bprm, &loc->elf_ex,
+			  (interpreter_type == INTERPRETER_AOUT),
+			  load_addr, interp_load_addr);
 	/* N.B. passed_fileno might not be initialized? */
 	if (interpreter_type == INTERPRETER_AOUT)
 		current->mm->arg_start += strlen(passed_fileno) + 1;
@@ -982,7 +994,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
 		   and some applications "depend" upon this behavior.
 		   Since we do not have the power to recompile these, we
-		   emulate the SVr4 behavior.  Sigh.  */
+		   emulate the SVr4 behavior. Sigh. */
 		down_write(&current->mm->mmap_sem);
 		error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
 				MAP_FIXED | MAP_PRIVATE, 0);
@@ -1037,7 +1049,6 @@ out_free_ph:
 
 /* This is really simpleminded and specialized - we are loading an
    a.out library that is given an ELF header. */
-
 static int load_elf_library(struct file *file)
 {
 	struct elf_phdr *elf_phdata;
@@ -1047,7 +1058,7 @@ static int load_elf_library(struct file *file)
 	struct elfhdr elf_ex;
 
 	error = -ENOEXEC;
-	retval = kernel_read(file, 0, (char *) &elf_ex, sizeof(elf_ex));
+	retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex));
 	if (retval != sizeof(elf_ex))
 		goto out;
 
@@ -1056,7 +1067,7 @@ static int load_elf_library(struct file *file)
 
 	/* First of all, some simple consistency checks */
 	if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
-	   !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
+	    !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
 		goto out;
 
 	/* Now read in all of the header information */
@@ -1104,7 +1115,8 @@ static int load_elf_library(struct file *file)
 		goto out_free_ph;
 	}
 
-	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1);
+	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
+			    ELF_MIN_ALIGN - 1);
 	bss = eppnt->p_memsz + eppnt->p_vaddr;
 	if (bss > len) {
 		down_write(&current->mm->mmap_sem);
@@ -1163,7 +1175,7 @@ static int maydump(struct vm_area_struct *vma)
 	if (vma->vm_flags & (VM_IO | VM_RESERVED))
 		return 0;
 
-	/* Dump shared memory only if mapped from an anonymous file.  */
+	/* Dump shared memory only if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED)
 		return vma->vm_file->f_dentry->d_inode->i_nlink == 0;
 
@@ -1174,7 +1186,7 @@ static int maydump(struct vm_area_struct *vma)
 	return 1;
 }
 
-#define roundup(x, y)  ((((x)+((y)-1))/(y))*(y))
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
 
 /* An ELF note in memory */
 struct memelfnote
@@ -1277,11 +1289,11 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
 }
 
 /*
- * fill up all the fields in prstatus from the given task struct, except registers
- * which need to be filled up separately.
+ * fill up all the fields in prstatus from the given task struct, except
+ * registers which need to be filled up separately.
  */
 static void fill_prstatus(struct elf_prstatus *prstatus,
-			struct task_struct *p, long signr) 
+		struct task_struct *p, long signr)
 {
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
@@ -1366,8 +1378,8 @@ struct elf_thread_status
 
 /*
  * In order to add the specific thread information for the elf file format,
- * we need to keep a linked list of every threads pr_status and then
- * create a single section for them in the final core file.
+ * we need to keep a linked list of every thread's pr_status and then create
+ * a single section for them in the final core file.
  */
 static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 {
@@ -1378,19 +1390,23 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 	fill_prstatus(&t->prstatus, p, signr);
 	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);	
 	
-	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus));
+	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+		  &(t->prstatus));
 	t->num_notes++;
 	sz += notesize(&t->notes[0]);
 
-	if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu))) {
-		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), &(t->fpu));
+	if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL,
+								&t->fpu))) {
+		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
+			  &(t->fpu));
 		t->num_notes++;
 		sz += notesize(&t->notes[1]);
 	}
 
 #ifdef ELF_CORE_COPY_XFPREGS
 	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
-		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu), &t->xfpu);
+		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu),
+			  &t->xfpu);
 		t->num_notes++;
 		sz += notesize(&t->notes[2]);
 	}
@@ -1405,7 +1421,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
  * and then they are actually written out.  If we run out of core limit
  * we just truncate.
  */
-static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
+static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 {
 #define	NUM_NOTES	6
 	int has_dumped = 0;
@@ -1434,12 +1450,12 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 	/*
 	 * We no longer stop all VM operations.
 	 * 
-	 * This is because those proceses that could possibly change map_count or
-	 * the mmap / vma pages are now blocked in do_exit on current finishing
-	 * this core dump.
+	 * This is because those processes that could possibly change map_count
+	 * or the mmap / vma pages are now blocked in do_exit on current
+	 * finishing this core dump.
 	 *
 	 * Only ptrace can touch these memory addresses, but it doesn't change
-	 * the map_count or the pages allocated.  So no possibility of crashing
+	 * the map_count or the pages allocated. So no possibility of crashing
 	 * exists while dumping the mm->vm_next areas to the core file.
 	 */
   
@@ -1501,7 +1517,7 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 #endif
 
 	/* Set up header */
-	fill_elf_header(elf, segs+1);	/* including notes section */
+	fill_elf_header(elf, segs + 1);	/* including notes section */
 
 	has_dumped = 1;
 	current->flags |= PF_DUMPCORE;
@@ -1511,24 +1527,24 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 	 * with info from their /proc.
 	 */
 
-	fill_note(notes +0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
-	
+	fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
 	fill_psinfo(psinfo, current->group_leader, current->mm);
-	fill_note(notes +1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+	fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
 	
 	numnote = 2;
 
-	auxv = (elf_addr_t *) current->mm->saved_auxv;
+	auxv = (elf_addr_t *)current->mm->saved_auxv;
 
 	i = 0;
 	do
 		i += 2;
 	while (auxv[i - 2] != AT_NULL);
 	fill_note(&notes[numnote++], "CORE", NT_AUXV,
-		  i * sizeof (elf_addr_t), auxv);
+		  i * sizeof(elf_addr_t), auxv);
 
   	/* Try to dump the FPU. */
-	if ((prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, fpu)))
+	if ((prstatus->pr_fpvalid =
+	     elf_core_copy_task_fpregs(current, regs, fpu)))
 		fill_note(notes + numnote++,
 			  "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
 #ifdef ELF_CORE_COPY_XFPREGS
@@ -1577,8 +1593,10 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 		phdr.p_memsz = sz;
 		offset += phdr.p_filesz;
 		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
-		if (vma->vm_flags & VM_WRITE) phdr.p_flags |= PF_W;
-		if (vma->vm_flags & VM_EXEC) phdr.p_flags |= PF_X;
+		if (vma->vm_flags & VM_WRITE)
+			phdr.p_flags |= PF_W;
+		if (vma->vm_flags & VM_EXEC)
+			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
 		DUMP_WRITE(&phdr, sizeof(phdr));
@@ -1595,7 +1613,9 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 
 	/* write out the thread status notes section */
 	list_for_each(t, &thread_list) {
-		struct elf_thread_status *tmp = list_entry(t, struct elf_thread_status, list);
+		struct elf_thread_status *tmp =
+				list_entry(t, struct elf_thread_status, list);
+
 		for (i = 0; i < tmp->num_notes; i++)
 			if (!writenote(&tmp->notes[i], file))
 				goto end_coredump;
@@ -1612,18 +1632,19 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 		for (addr = vma->vm_start;
 		     addr < vma->vm_end;
 		     addr += PAGE_SIZE) {
-			struct page* page;
+			struct page *page;
 			struct vm_area_struct *vma;
 
 			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
 						&page, &vma) <= 0) {
-				DUMP_SEEK (file->f_pos + PAGE_SIZE);
+				DUMP_SEEK(file->f_pos + PAGE_SIZE);
 			} else {
 				if (page == ZERO_PAGE(addr)) {
-					DUMP_SEEK (file->f_pos + PAGE_SIZE);
+					DUMP_SEEK(file->f_pos + PAGE_SIZE);
 				} else {
 					void *kaddr;
-					flush_cache_page(vma, addr, page_to_pfn(page));
+					flush_cache_page(vma, addr,
+							 page_to_pfn(page));
 					kaddr = kmap(page);
 					if ((size += PAGE_SIZE) > limit ||
 					    !dump_write(file, kaddr,
@@ -1645,7 +1666,8 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 
 	if ((off_t)file->f_pos != offset) {
 		/* Sanity check */
-		printk(KERN_WARNING "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n",
+		printk(KERN_WARNING
+		       "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n",
 		       (off_t)file->f_pos, offset);
 	}
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a2e48c999c24..eba4e23b9ca0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -435,9 +435,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 				   struct elf_fdpic_params *interp_params)
 {
 	unsigned long sp, csp, nitems;
-	elf_caddr_t *argv, *envp;
+	elf_caddr_t __user *argv, *envp;
 	size_t platform_len = 0, len;
-	char *k_platform, *u_platform, *p;
+	char *k_platform;
+	char __user *u_platform, *p;
 	long hwcap;
 	int loop;
 
@@ -462,12 +463,11 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	if (k_platform) {
 		platform_len = strlen(k_platform) + 1;
 		sp -= platform_len;
+		u_platform = (char __user *) sp;
 		if (__copy_to_user(u_platform, k_platform, platform_len) != 0)
 			return -EFAULT;
 	}
 
-	u_platform = (char *) sp;
-
 #if defined(__i386__) && defined(CONFIG_SMP)
 	/* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
 	 * by the processes running on the same package. One thing we can do
@@ -490,7 +490,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	sp = (sp - len) & ~7UL;
 	exec_params->map_addr = sp;
 
-	if (copy_to_user((void *) sp, exec_params->loadmap, len) != 0)
+	if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0)
 		return -EFAULT;
 
 	current->mm->context.exec_fdpic_loadmap = (unsigned long) sp;
@@ -501,7 +501,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 		sp = (sp - len) & ~7UL;
 		interp_params->map_addr = sp;
 
-		if (copy_to_user((void *) sp, interp_params->loadmap, len) != 0)
+		if (copy_to_user((void __user *) sp, interp_params->loadmap, len) != 0)
 			return -EFAULT;
 
 		current->mm->context.interp_fdpic_loadmap = (unsigned long) sp;
@@ -527,7 +527,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	/* put the ELF interpreter info on the stack */
 #define NEW_AUX_ENT(nr, id, val)						\
 	do {									\
-		struct { unsigned long _id, _val; } *ent = (void *) csp;	\
+		struct { unsigned long _id, _val; } __user *ent = (void __user *) csp;	\
 		__put_user((id), &ent[nr]._id);					\
 		__put_user((val), &ent[nr]._val);				\
 	} while (0)
@@ -564,13 +564,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 
 	/* allocate room for argv[] and envv[] */
 	csp -= (bprm->envc + 1) * sizeof(elf_caddr_t);
-	envp = (elf_caddr_t *) csp;
+	envp = (elf_caddr_t __user *) csp;
 	csp -= (bprm->argc + 1) * sizeof(elf_caddr_t);
-	argv = (elf_caddr_t *) csp;
+	argv = (elf_caddr_t __user *) csp;
 
 	/* stack argc */
 	csp -= sizeof(unsigned long);
-	__put_user(bprm->argc, (unsigned long *) csp);
+	__put_user(bprm->argc, (unsigned long __user *) csp);
 
 	BUG_ON(csp != sp);
 
@@ -581,7 +581,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	current->mm->arg_start = current->mm->start_stack - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p);
 #endif
 
-	p = (char *) current->mm->arg_start;
+	p = (char __user *) current->mm->arg_start;
 	for (loop = bprm->argc; loop > 0; loop--) {
 		__put_user((elf_caddr_t) p, argv++);
 		len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES);
@@ -1025,7 +1025,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		/* clear the bit between beginning of mapping and beginning of PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			clear_user((void *) maddr, disp);
+			clear_user((void __user *) maddr, disp);
 			maddr += disp;
 		}
 
@@ -1059,7 +1059,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			clear_user((void *) maddr + phdr->p_filesz, excess1);
+			clear_user((void __user *) maddr + phdr->p_filesz, excess1);
 		}
 
 #else
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b1c902e319c1..c94d52eafd1b 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -510,7 +510,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		}
 
 		/* OK, This is the point of no return */
-		set_personality(PER_LINUX);
+		set_personality(PER_LINUX_32BIT);
 	}
 
 	/*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index d73d75591a39..34ebbc191e46 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -55,6 +55,7 @@ typedef struct {
 } Node;
 
 static DEFINE_RWLOCK(entries_lock);
+static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
 
@@ -203,7 +204,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto _error;
 
 	if (files) {
-		steal_locks(files);
 		put_files_struct(files);
 		files = NULL;
 	}
@@ -638,7 +638,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	if (!inode)
 		goto out2;
 
-	err = simple_pin_fs("binfmt_misc", &bm_mnt, &entry_count);
+	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
 	if (err) {
 		iput(inode);
 		inode = NULL;
@@ -740,10 +740,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 	return err;
 }
 
-static struct super_block *bm_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bm_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, bm_fill_super);
+	return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
 }
 
 static struct linux_binfmt misc_format = {
diff --git a/fs/bio.c b/fs/bio.c
index 098c12b2d60a..6a0b9ad8f8c9 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -654,9 +654,10 @@ static struct bio *__bio_map_user_iov(request_queue_t *q,
 				     write_to_vm, 0, &pages[cur_page], NULL);
 		up_read(&current->mm->mmap_sem);
 
-		if (ret < local_nr_pages)
+		if (ret < local_nr_pages) {
+			ret = -EFAULT;
 			goto out_unmap;
-
+		}
 
 		offset = uaddr & ~PAGE_MASK;
 		for (j = cur_page; j < page_limit; j++) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f5958f413bd1..028d9fb9c2d5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -300,10 +300,10 @@ static struct super_operations bdev_sops = {
 	.clear_inode = bdev_clear_inode,
 };
 
-static struct super_block *bd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
+	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
 }
 
 static struct file_system_type bd_type = {
@@ -414,21 +414,31 @@ EXPORT_SYMBOL(bdput);
 static struct block_device *bd_acquire(struct inode *inode)
 {
 	struct block_device *bdev;
+
 	spin_lock(&bdev_lock);
 	bdev = inode->i_bdev;
-	if (bdev && igrab(bdev->bd_inode)) {
+	if (bdev) {
+		atomic_inc(&bdev->bd_inode->i_count);
 		spin_unlock(&bdev_lock);
 		return bdev;
 	}
 	spin_unlock(&bdev_lock);
+
 	bdev = bdget(inode->i_rdev);
 	if (bdev) {
 		spin_lock(&bdev_lock);
-		if (inode->i_bdev)
-			__bd_forget(inode);
-		inode->i_bdev = bdev;
-		inode->i_mapping = bdev->bd_inode->i_mapping;
-		list_add(&inode->i_devices, &bdev->bd_inodes);
+		if (!inode->i_bdev) {
+			/*
+			 * We take an additional bd_inode->i_count for inode,
+			 * and it's released in clear_inode() of inode.
+			 * So, we can access it via ->i_mapping always
+			 * without igrab().
+			 */
+			atomic_inc(&bdev->bd_inode->i_count);
+			inode->i_bdev = bdev;
+			inode->i_mapping = bdev->bd_inode->i_mapping;
+			list_add(&inode->i_devices, &bdev->bd_inodes);
+		}
 		spin_unlock(&bdev_lock);
 	}
 	return bdev;
@@ -438,10 +448,18 @@ static struct block_device *bd_acquire(struct inode *inode)
 
 void bd_forget(struct inode *inode)
 {
+	struct block_device *bdev = NULL;
+
 	spin_lock(&bdev_lock);
-	if (inode->i_bdev)
+	if (inode->i_bdev) {
+		if (inode->i_sb != blockdev_superblock)
+			bdev = inode->i_bdev;
 		__bd_forget(inode);
+	}
 	spin_unlock(&bdev_lock);
+
+	if (bdev)
+		iput(bdev->bd_inode);
 }
 
 int bd_claim(struct block_device *bdev, void *holder)
diff --git a/fs/buffer.c b/fs/buffer.c
index 23f1f3a68077..373bb6292bdc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync)
 		goto out;
 	}
 
-	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(mapping);
 
 	/*
@@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync)
 	err = filemap_fdatawait(mapping);
 	if (!ret)
 		ret = err;
-	current->flags &= ~PF_SYNCWRITE;
 out:
 	return ret;
 }
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 1a27ecb46c9a..7271bb0257f6 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,10 @@
+Version 1.43
+------------
+POSIX locking to servers which support CIFS POSIX Extensions
+(disabled by default; controlled by /proc/fs/cifs/Experimental).
+Handle conversion of long share names (especially Asian languages)
+to Unicode during mount. 
+
 Version 1.42
 ------------
 Fix slow oplock break when mounted to different servers at the same time and
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce9..8b4de6eaabd0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -166,8 +166,9 @@ cifs_put_super(struct super_block *sb)
 }
 
 static int
-cifs_statfs(struct super_block *sb, struct kstatfs *buf)
+cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	int xid; 
 	int rc = -EOPNOTSUPP;
 	struct cifs_sb_info *cifs_sb;
@@ -402,12 +403,14 @@ static struct quotactl_ops cifs_quotactl_ops = {
 #endif
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-static void cifs_umount_begin(struct super_block * sblock)
+static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 {
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo * tcon;
 
-	cifs_sb = CIFS_SB(sblock);
+	if (!(flags & MNT_FORCE))
+		return;
+	cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
 	if(cifs_sb == NULL)
 		return;
 
@@ -460,9 +463,9 @@ struct super_operations cifs_super_ops = {
 	.remount_fs = cifs_remount,
 };
 
-static struct super_block *
+static int
 cifs_get_sb(struct file_system_type *fs_type,
-	    int flags, const char *dev_name, void *data)
+	    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	int rc;
 	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
@@ -470,7 +473,7 @@ cifs_get_sb(struct file_system_type *fs_type,
 	cFYI(1, ("Devname: %s flags: %d ", dev_name, flags));
 
 	if (IS_ERR(sb))
-		return sb;
+		return PTR_ERR(sb);
 
 	sb->s_flags = flags;
 
@@ -478,10 +481,10 @@ cifs_get_sb(struct file_system_type *fs_type,
 	if (rc) {
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
-		return ERR_PTR(rc);
+		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
-	return sb;
+	return simple_set_mnt(mnt, sb);
 }
 
 static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 4e829dc672a6..d56c0577c710 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -74,7 +74,7 @@ extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			 size_t write_size, loff_t * poffset);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, struct dentry *, int);
-extern int cifs_flush(struct file *);
+extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
@@ -99,5 +99,5 @@ extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 extern int cifs_ioctl (struct inode * inode, struct file * filep,
 		       unsigned int command, unsigned long arg);
-#define CIFS_VERSION   "1.42"
+#define CIFS_VERSION   "1.43"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 2879ba343ca7..310ea2f0e0bf 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -267,7 +267,7 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 			const int waitFlag);
 extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 			const __u16 smb_file_id, const int get_flag,
-			const __u64 len, const __u64 offset, 
+			const __u64 len, struct file_lock *, 
 			const __u16 lock_type, const int waitFlag);
 extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
 extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index fd36892eda55..925881e00ff2 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1355,7 +1355,8 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 int
 CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 		const __u16 smb_file_id, const int get_flag, const __u64 len,
-		const __u64 lkoffset, const __u16 lock_type, const int waitFlag)
+		struct file_lock *pLockData, const __u16 lock_type, 
+		const int waitFlag)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
 	struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
@@ -1366,6 +1367,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, param_offset, offset, byte_count, count;
 
 	cFYI(1, ("Posix Lock"));
+
+	if(pLockData == NULL)
+		return -EINVAL;	/* negative errno, like the other error paths here */
+
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -1404,10 +1409,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 
 	parm_data->lock_type = cpu_to_le16(lock_type);
 	if(waitFlag)
-		parm_data->lock_flags = 1;
+		parm_data->lock_flags = cpu_to_le16(1);
 	parm_data->pid = cpu_to_le32(current->tgid);
-	parm_data->start = lkoffset;
-	parm_data->length = len;  /* normalize negative numbers */
+	parm_data->start = cpu_to_le64(pLockData->fl_start);
+	parm_data->length = cpu_to_le64(len);  /* normalize negative numbers */
 
 	pSMB->DataOffset = cpu_to_le16(offset);
 	pSMB->Fid = smb_file_id;
@@ -1419,8 +1424,33 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 			(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
 		cFYI(1, ("Send error in Posix Lock = %d", rc));
-	}
+	} else if (get_flag) {
+		/* lock structure can be returned on get */
+		__u16 data_offset;
+		__u16 data_count;
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
+		if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) {
+			rc = -EIO;      /* bad smb */
+			goto plk_err_exit;
+		}
+		if(pLockData == NULL) {
+			rc = -EINVAL;
+			goto plk_err_exit;
+		}
+		data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+		data_count  = le16_to_cpu(pSMBr->t2.DataCount);
+		if(data_count < sizeof(struct cifs_posix_lock)) {
+			rc = -EIO;
+			goto plk_err_exit;
+		}
+		parm_data = (struct cifs_posix_lock *)
+			((char *)&pSMBr->hdr.Protocol + data_offset);
+		if(parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
+			pLockData->fl_type = F_UNLCK;
+	}
+ 
+plk_err_exit:
 	if (pSMB)
 		cifs_small_buf_release(pSMB);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d2ec806a4f32..bae1479318d1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2148,6 +2148,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
+				if(ses->serverOS)
+					kfree(ses->serverOS);
 				ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL);
 				if(ses->serverOS == NULL)
 					goto sesssetup_nomem;
@@ -2160,6 +2162,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				if (remaining_words > 0) {
 					len = UniStrnlen((wchar_t *)bcc_ptr,
 							 remaining_words-1);
+					if(ses->serverNOS)
+						kfree(ses->serverNOS);
 					ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL);
 					if(ses->serverNOS == NULL)
 						goto sesssetup_nomem;
@@ -2177,6 +2181,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					if (remaining_words > 0) {
 						len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
           /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain =
 						    kzalloc(2*(len+1),GFP_KERNEL);
 						if(ses->serverDomain == NULL)
@@ -2187,15 +2193,22 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						ses->serverDomain[2*len] = 0;
 						ses->serverDomain[1+(2*len)] = 0;
 					} /* else no more room so create dummy domain string */
-					else
+					else {
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = 
 							kzalloc(2, GFP_KERNEL);
+					}
 				} else {	/* no room so create dummy domain and NOS string */
 					/* if these kcallocs fail not much we
 					   can do, but better to not fail the
 					   sesssetup itself */
+					if(ses->serverDomain)
+						kfree(ses->serverDomain);
 					ses->serverDomain =
 					    kzalloc(2, GFP_KERNEL);
+					if(ses->serverNOS)
+						kfree(ses->serverNOS);
 					ses->serverNOS =
 					    kzalloc(2, GFP_KERNEL);
 				}
@@ -2204,6 +2217,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				if (((long) bcc_ptr + len) - (long)
 				    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
+					if(ses->serverOS)
+						kfree(ses->serverOS);
 					ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverOS == NULL)
 						goto sesssetup_nomem;
@@ -2214,6 +2229,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					bcc_ptr++;
 
 					len = strnlen(bcc_ptr, 1024);
+					if(ses->serverNOS)
+						kfree(ses->serverNOS);
 					ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverNOS == NULL)
 						goto sesssetup_nomem;
@@ -2223,6 +2240,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					bcc_ptr++;
 
 					len = strnlen(bcc_ptr, 1024);
+					if(ses->serverDomain)
+						kfree(ses->serverDomain);
 					ses->serverDomain = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverDomain == NULL)
 						goto sesssetup_nomem;
@@ -2427,6 +2446,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
+					if(ses->serverOS)
+						kfree(ses->serverOS);
 					ses->serverOS =
 					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
@@ -2441,6 +2462,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						len = UniStrnlen((wchar_t *)bcc_ptr,
 								 remaining_words
 								 - 1);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
@@ -2454,7 +2477,9 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						remaining_words -= len + 1;
 						if (remaining_words > 0) {
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
-                            /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
+              /* last string not always null terminated (e.g. Windows XP/2000) */
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL);
 							cifs_strfromUCS_le(ses->serverDomain,
 							     (__le16 *)bcc_ptr, 
@@ -2463,11 +2488,18 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 							ses->serverDomain[2*len] = 0;
 							ses->serverDomain[1+(2*len)] = 0;
 						} /* else no more room so create dummy domain string */
-						else
+						else {
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2,GFP_KERNEL);
-					} else {	/* no room so create dummy domain and NOS string */
+						}
+					} else {/* no room use dummy domain&NOS */
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(2, GFP_KERNEL);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(2, GFP_KERNEL);
 					}
 				} else {	/* ASCII */
@@ -2476,6 +2508,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					if (((long) bcc_ptr + len) - (long)
 					    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
+						if(ses->serverOS)
+							kfree(ses->serverOS);
 						ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
 						strncpy(ses->serverOS, bcc_ptr, len);
 
@@ -2484,6 +2518,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
 						strncpy(ses->serverNOS, bcc_ptr, len);
 						bcc_ptr += len;
@@ -2491,6 +2527,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
 						strncpy(ses->serverDomain, bcc_ptr, len);
 						bcc_ptr += len;
@@ -2728,6 +2766,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
+					if(ses->serverOS)
+						kfree(ses->serverOS);
 					ses->serverOS =
 					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
@@ -2743,6 +2783,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 								 bcc_ptr,
 								 remaining_words
 								 - 1);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
@@ -2760,6 +2802,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						if (remaining_words > 0) {
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
            /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2 *
 								    (len +
@@ -2777,13 +2821,20 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 								[1 + (2 * len)]
 							    = 0;
 						} /* else no more room so create dummy domain string */
-						else
+						else {
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2,
 								    GFP_KERNEL);
+						}
 					} else {	/* no room so create dummy domain and NOS string */
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain =
 						    kzalloc(2, GFP_KERNEL);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2, GFP_KERNEL);
 					}
@@ -2792,6 +2843,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 					if (((long) bcc_ptr + len) - (long)
 					    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
+						if(ses->serverOS)
+							kfree(ses->serverOS);
 						ses->serverOS =
 						    kzalloc(len + 1,
 							    GFP_KERNEL);
@@ -2803,6 +2856,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(len + 1,
 							    GFP_KERNEL);
@@ -2812,6 +2867,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain =
 						    kzalloc(len + 1,
 							    GFP_KERNEL);
@@ -3116,6 +3173,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
   the end since (at least) WIN2K and Windows XP have a major bug in not null
   terminating last Unicode string in response  */
+					if(ses->serverOS)
+						kfree(ses->serverOS);
 					ses->serverOS =
 					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
@@ -3131,6 +3190,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 								 bcc_ptr,
 								 remaining_words
 								 - 1);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
@@ -3147,6 +3208,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						if (remaining_words > 0) {
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
      /* last string not always null terminated (e.g. for Windows XP & 2000) */
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2 *
 								    (len +
@@ -3172,10 +3235,17 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 									  len)]
 							    = 0;
 						} /* else no more room so create dummy domain string */
-						else
+						else {
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain = kzalloc(2,GFP_KERNEL);
+						}
 					} else {  /* no room so create dummy domain and NOS string */
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(2, GFP_KERNEL);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(2, GFP_KERNEL);
 					}
 				} else {	/* ASCII */
@@ -3183,6 +3253,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					if (((long) bcc_ptr + len) - 
                         (long) pByteArea(smb_buffer_response) 
                             <= BCC(smb_buffer_response)) {
+						if(ses->serverOS)
+							kfree(ses->serverOS);
 						ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
 						strncpy(ses->serverOS,bcc_ptr, len);
 
@@ -3191,6 +3263,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(len+1,GFP_KERNEL);
 						strncpy(ses->serverNOS, bcc_ptr, len);	
 						bcc_ptr += len;
@@ -3198,6 +3272,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(len+1,GFP_KERNEL);
 						strncpy(ses->serverDomain, bcc_ptr, len);
 						bcc_ptr += len;
@@ -3282,7 +3358,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		bcc_ptr++; /* align */
 	}
 
-	if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if(ses->server->secMode & 
+			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	if (ses->capabilities & CAP_STATUS32) {
@@ -3294,8 +3371,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	if (ses->capabilities & CAP_UNICODE) {
 		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
 		length =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, tree, 100, nls_codepage);
-		bcc_ptr += 2 * length;	/* convert num of 16 bit words to bytes */
+		    cifs_strtoUCS((__le16 *) bcc_ptr, tree, 
+			6 /* max utf8 char length in bytes */ * 
+			(/* server len*/ + 256 /* share len */), nls_codepage);
+		bcc_ptr += 2 * length;	/* convert num 16 bit words to bytes */
 		bcc_ptr += 2;	/* skip trailing null */
 	} else {		/* ASCII */
 		strcpy(bcc_ptr, tree);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e152bf6afa60..b4a18c1cab0a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -84,6 +84,8 @@ static inline int cifs_get_disposition(unsigned int flags)
 		return FILE_OVERWRITE_IF;
 	else if ((flags & O_CREAT) == O_CREAT)
 		return FILE_OPEN_IF;
+	else if ((flags & O_TRUNC) == O_TRUNC)
+		return FILE_OVERWRITE;
 	else
 		return FILE_OPEN;
 }
@@ -656,7 +658,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 			else
 				posix_lock_type = CIFS_WRLCK;
 			rc = CIFSSMBPosixLock(xid, pTcon, netfid, 1 /* get */,
-					length,	pfLock->fl_start,
+					length,	pfLock,
 					posix_lock_type, wait_flag);
 			FreeXid(xid);
 			return rc;
@@ -704,7 +706,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 			return -EOPNOTSUPP;
 		}
 		rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */,
-				      length, pfLock->fl_start,
+				      length, pfLock,
 				      posix_lock_type, wait_flag);
 	} else
 		rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start,
@@ -904,8 +906,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 				if (rc != 0)
 					break;
 			}
-			if(experimEnabled || (pTcon->ses->server->secMode & 
-			 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0) {
+			if(experimEnabled || (pTcon->ses->server &&
+				((pTcon->ses->server->secMode & 
+				(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+				== 0))) {
 				struct kvec iov[2];
 				unsigned int len;
 
@@ -1075,9 +1079,9 @@ static int cifs_writepages(struct address_space *mapping,
 	unsigned int bytes_written;
 	struct cifs_sb_info *cifs_sb;
 	int done = 0;
-	pgoff_t end = -1;
+	pgoff_t end;
 	pgoff_t index;
-	int is_range = 0;
+ 	int range_whole = 0;
 	struct kvec iov[32];
 	int len;
 	int n_iov = 0;
@@ -1118,16 +1122,14 @@ static int cifs_writepages(struct address_space *mapping,
 	xid = GetXid();
 
 	pagevec_init(&pvec, 0);
-	if (wbc->sync_mode == WB_SYNC_NONE)
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
-	else {
-		index = 0;
-		scanned = 1;
-	}
-	if (wbc->start || wbc->end) {
-		index = wbc->start >> PAGE_CACHE_SHIFT;
-		end = wbc->end >> PAGE_CACHE_SHIFT;
-		is_range = 1;
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -1163,7 +1165,7 @@ retry:
 				break;
 			}
 
-			if (unlikely(is_range) && (page->index > end)) {
+			if (!wbc->range_cyclic && page->index > end) {
 				done = 1;
 				unlock_page(page);
 				break;
@@ -1267,7 +1269,7 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (!is_range)
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 
 	FreeXid(xid);
@@ -1415,7 +1417,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
  * As file closes, flush all cached write data for this inode checking
  * for write behind errors.
  */
-int cifs_flush(struct file *file)
+int cifs_flush(struct file *file, fl_owner_t id)
 {
 	struct inode * inode = file->f_dentry->d_inode;
 	int rc = 0;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7c2642431fa5..cc66c681bd11 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -164,7 +164,7 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 	return 0;
 }
 
-int coda_flush(struct file *coda_file)
+int coda_flush(struct file *coda_file, fl_owner_t id)
 {
 	unsigned short flags = coda_file->f_flags & ~O_EXCL;
 	unsigned short coda_flags = coda_flags_to_cflags(flags);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index ada1a81df6bd..87f1dc8aa24b 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -36,7 +36,7 @@
 /* VFS super_block ops */
 static void coda_clear_inode(struct inode *);
 static void coda_put_super(struct super_block *);
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf);
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static kmem_cache_t * coda_inode_cachep;
 
@@ -278,13 +278,13 @@ struct inode_operations coda_file_inode_operations = {
 	.setattr	= coda_setattr,
 };
 
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int error;
 	
 	lock_kernel();
 
-	error = venus_statfs(sb, buf);
+	error = venus_statfs(dentry, buf);
 
 	unlock_kernel();
 
@@ -307,10 +307,10 @@ static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
 
 /* init_coda: used by filesystems.c to register coda */
 
-static struct super_block *coda_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int coda_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, coda_fill_super);
+	return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
 }
 
 struct file_system_type coda_fs_type = {
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1bae99650a91..b040eba13a7d 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -611,7 +611,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 	return error;
 }
 
-int venus_statfs(struct super_block *sb, struct kstatfs *sfs) 
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
 { 
         union inputArgs *inp;
         union outputArgs *outp;
@@ -620,7 +620,7 @@ int venus_statfs(struct super_block *sb, struct kstatfs *sfs)
 	insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs));
 	UPARG(CODA_STATFS);
 
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+        error = coda_upcall(coda_sbp(dentry->d_sb), insize, &outsize, inp);
 	
         if (!error) {
 		sfs->f_blocks = outp->coda_statfs.stat.f_blocks;
diff --git a/fs/compat.c b/fs/compat.c
index b1f64786a613..7e7e5bc4f3cf 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -197,7 +197,7 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs(nd.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs(buf, &tmp);
 		path_release(&nd);
@@ -215,7 +215,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs(file->f_dentry, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
 	fput(file);
@@ -265,7 +265,7 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs(nd.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs64(buf, &tmp);
 		path_release(&nd);
@@ -286,7 +286,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs(file->f_dentry, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
 	fput(file);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d2c38875ab29..9eb9824dd332 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -205,38 +205,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
 }
 
-struct compat_dmx_event {
-	dmx_event_t	event;
-	compat_time_t	timeStamp;
-	union
-	{
-		dmx_scrambling_status_t scrambling;
-	} u;
-};
-
-static int do_dmx_get_event(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct dmx_event kevent;
-	mm_segment_t old_fs = get_fs();
-	int err;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
-	set_fs(old_fs);
-
-	if (!err) {
-		struct compat_dmx_event __user *up = compat_ptr(arg);
-
-		err  = put_user(kevent.event, &up->event);
-		err |= put_user(kevent.timeStamp, &up->timeStamp);
-		err |= put_user(kevent.u.scrambling, &up->u.scrambling);
-		if (err)
-			err = -EFAULT;
-	}
-
-	return err;
-}
-
 struct compat_video_event {
 	int32_t		type;
 	compat_time_t	timestamp;
@@ -2964,7 +2932,6 @@ HANDLE_IOCTL(NCP_IOC_SETPRIVATEDATA_32, do_ncp_setprivatedata)
 #endif
 
 /* dvb */
-HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event)
 HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
 HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
 HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f920d30478e5..3e5fe843e1df 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -103,10 +103,10 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+	return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
 }
 
 static struct file_system_type configfs_fs_type = {
@@ -118,7 +118,7 @@ static struct file_system_type configfs_fs_type = {
 
 int configfs_pin_fs(void)
 {
-	return simple_pin_fs("configfs", &configfs_mount,
+	return simple_pin_fs(&configfs_fs_type, &configfs_mount,
 			     &configfs_mnt_count);
 }
 
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9efcc3a164e8..c45d73860803 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -181,9 +181,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_cache_page(mapping, blocknr + i,
-				(filler_t *)mapping->a_ops->readpage,
-				NULL);
+			page = read_mapping_page(mapping, blocknr + i, NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
@@ -322,8 +320,10 @@ out:
 	return -EINVAL;
 }
 
-static int cramfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = CRAMFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_blocks = CRAMFS_SB(sb)->blocks;
@@ -528,10 +528,11 @@ static struct super_operations cramfs_ops = {
 	.statfs		= cramfs_statfs,
 };
 
-static struct super_block *cramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int cramfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type cramfs_fs_type = {
diff --git a/fs/dcache.c b/fs/dcache.c
index 940d188e5d14..b85fda360533 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -359,12 +359,13 @@ restart:
 }
 
 /*
- * Throw away a dentry - free the inode, dput the parent.
- * This requires that the LRU list has already been
- * removed.
+ * Throw away a dentry - free the inode, dput the parent.  This requires that
+ * the LRU list has already been removed.
+ *
  * Called with dcache_lock, drops it and then regains.
+ * Called with dentry->d_lock held, drops it.
  */
-static inline void prune_one_dentry(struct dentry * dentry)
+static void prune_one_dentry(struct dentry * dentry)
 {
 	struct dentry * parent;
 
@@ -382,6 +383,8 @@ static inline void prune_one_dentry(struct dentry * dentry)
 /**
  * prune_dcache - shrink the dcache
  * @count: number of entries to try and free
+ * @sb: if given, ignore dentries for other superblocks
+ *         which are being unmounted.
  *
  * Shrink the dcache. This is done when we need
  * more memory, or simply when we need to unmount
@@ -392,16 +395,29 @@ static inline void prune_one_dentry(struct dentry * dentry)
  * all the dentries are in use.
  */
  
-static void prune_dcache(int count)
+static void prune_dcache(int count, struct super_block *sb)
 {
 	spin_lock(&dcache_lock);
 	for (; count ; count--) {
 		struct dentry *dentry;
 		struct list_head *tmp;
+		struct rw_semaphore *s_umount;
 
 		cond_resched_lock(&dcache_lock);
 
 		tmp = dentry_unused.prev;
+		if (sb) {
+			/* Try to find a dentry for this sb, but don't try
+			 * too hard, if they aren't near the tail they will
+			 * be moved down again soon
+			 */
+			int skip = count;
+			while (skip && tmp != &dentry_unused &&
+			    list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
+				skip--;
+				tmp = tmp->prev;
+			}
+		}
 		if (tmp == &dentry_unused)
 			break;
 		list_del_init(tmp);
@@ -427,7 +443,45 @@ static void prune_dcache(int count)
  			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		prune_one_dentry(dentry);
+		/*
+		 * If the dentry is not DCACHE_REFERENCED, it is time
+		 * to remove it from the dcache, provided the super block is
+		 * NULL (which means we are trying to reclaim memory)
+		 * or this dentry belongs to the same super block that
+		 * we want to shrink.
+		 */
+		/*
+		 * If this dentry is for "my" filesystem, then I can prune it
+		 * without taking the s_umount lock (I already hold it).
+		 */
+		if (sb && dentry->d_sb == sb) {
+			prune_one_dentry(dentry);
+			continue;
+		}
+		/*
+		 * ...otherwise we need to be sure this filesystem isn't being
+		 * unmounted, otherwise we could race with
+		 * generic_shutdown_super(), and end up holding a reference to
+		 * an inode while the filesystem is unmounted.
+		 * So we try to get s_umount, and make sure s_root isn't NULL.
+		 * (Take a local copy of s_umount to avoid a use-after-free of
+		 * `dentry').
+		 */
+		s_umount = &dentry->d_sb->s_umount;
+		if (down_read_trylock(s_umount)) {
+			if (dentry->d_sb->s_root != NULL) {
+				prune_one_dentry(dentry);
+				up_read(s_umount);
+				continue;
+			}
+			up_read(s_umount);
+		}
+		spin_unlock(&dentry->d_lock);
+		/* Cannot remove the first dentry, and it isn't appropriate
+		 * to move it to the head of the list, so give up, and try
+		 * later
+		 */
+		break;
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -630,46 +684,7 @@ void shrink_dcache_parent(struct dentry * parent)
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found);
-}
-
-/**
- * shrink_dcache_anon - further prune the cache
- * @head: head of d_hash list of dentries to prune
- *
- * Prune the dentries that are anonymous
- *
- * parsing d_hash list does not hlist_for_each_entry_rcu() as it
- * done under dcache_lock.
- *
- */
-void shrink_dcache_anon(struct hlist_head *head)
-{
-	struct hlist_node *lp;
-	int found;
-	do {
-		found = 0;
-		spin_lock(&dcache_lock);
-		hlist_for_each(lp, head) {
-			struct dentry *this = hlist_entry(lp, struct dentry, d_hash);
-			if (!list_empty(&this->d_lru)) {
-				dentry_stat.nr_unused--;
-				list_del_init(&this->d_lru);
-			}
-
-			/* 
-			 * move only zero ref count dentries to the end 
-			 * of the unused list for prune_dcache
-			 */
-			if (!atomic_read(&this->d_count)) {
-				list_add_tail(&this->d_lru, &dentry_unused);
-				dentry_stat.nr_unused++;
-				found++;
-			}
-		}
-		spin_unlock(&dcache_lock);
-		prune_dcache(found);
-	} while(found);
+		prune_dcache(found, parent->d_sb);
 }
 
 /*
@@ -689,7 +704,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
-		prune_dcache(nr);
+		prune_dcache(nr, NULL);
 	}
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 85d166cdcae4..6fa1e04f8415 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -67,12 +67,13 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
 			 int mode, dev_t dev)
 {
-	struct inode *inode = debugfs_get_inode(dir->i_sb, mode, dev);
+	struct inode *inode;
 	int error = -EPERM;
 
 	if (dentry->d_inode)
 		return -EEXIST;
 
+	inode = debugfs_get_inode(dir->i_sb, mode, dev);
 	if (inode) {
 		d_instantiate(dentry, inode);
 		dget(dentry);
@@ -110,11 +111,11 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
 
-static struct super_block *debug_get_sb(struct file_system_type *fs_type,
-				        int flags, const char *dev_name,
-					void *data)
+static int debug_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, debug_fill_super);
+	return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
 }
 
 static struct file_system_type debug_fs_type = {
@@ -198,7 +199,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 
 	pr_debug("debugfs: creating file '%s'\n",name);
 
-	error = simple_pin_fs("debugfs", &debugfs_mount, &debugfs_mount_count);
+	error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
 	if (error)
 		goto exit;
 
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 52f5059c4f31..51a97f132745 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -2549,11 +2549,11 @@ static int devfs_fill_super(struct super_block *sb, void *data, int silent)
 	return -EINVAL;
 }				/*  End Function devfs_fill_super  */
 
-static struct super_block *devfs_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int devfs_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devfs_fill_super);
+	return get_sb_single(fs_type, flags, data, devfs_fill_super, mnt);
 }
 
 static struct file_system_type devfs_fs_type = {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14c5620b5cab..f7aef5bb584a 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -130,10 +130,10 @@ fail:
 	return -ENOMEM;
 }
 
-static struct super_block *devpts_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int devpts_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devpts_fill_super);
+	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
 }
 
 static struct file_system_type devpts_fs_type = {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b05d1b218776..538fb0418fba 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 		NULL);				/* vmas */
 	up_read(&current->mm->mmap_sem);
 
-	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio)
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
-		create = dio->rw == WRITE;
+		create = dio->rw & WRITE;
 		if (dio->lock_type == DIO_LOCKING) {
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 							dio->blkbits))
@@ -867,7 +867,7 @@ do_holes:
 				loff_t i_size_aligned;
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
-				if (dio->rw == WRITE) {
+				if (dio->rw & WRITE) {
 					page_cache_release(page);
 					return -ENOTBLK;
 				}
@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && rw == WRITE) {
+	if (ret == -ENOTBLK && (rw & WRITE)) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->is_async) {
 		int should_wait = 0;
 
-		if (dio->result < dio->size && rw == WRITE) {
+		if (dio->result < dio->size && (rw & WRITE)) {
 			dio->waiter = current;
 			should_wait = 1;
 		}
@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 			ret = transferred;
 
 		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && rw == WRITE &&
+		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
 		    ret >= 0 && dio->result == dio->size)
 			/*
 			 * For AIO writes where we have completed the
@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		current->flags |= PF_SYNCWRITE;
+		rw = WRITE_SYNC;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * even for AIO, we need to wait for i/o to complete before
 	 * returning in this case.
 	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) &&
+	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
@@ -1284,8 +1284,6 @@ out:
 		mutex_unlock(&inode->i_mutex);
 	else if (acquire_i_mutex)
 		mutex_lock(&inode->i_mutex);
-	if (rw & WRITE)
-		current->flags &= ~PF_SYNCWRITE;
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/efs/super.c b/fs/efs/super.c
index dff623e3ddbf..8ac2462ae5dd 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -15,13 +15,13 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 
-static int efs_statfs(struct super_block *s, struct kstatfs *buf);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
-static struct super_block *efs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int efs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
 }
 
 static struct file_system_type efs_fs_type = {
@@ -322,8 +322,8 @@ out_no_fs:
 	return -EINVAL;
 }
 
-static int efs_statfs(struct super_block *s, struct kstatfs *buf) {
-	struct efs_sb_info *sb = SUPER_INFO(s);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
+	struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb);
 
 	buf->f_type    = EFS_SUPER_MAGIC;	/* efs magic number */
 	buf->f_bsize   = EFS_BLOCKSIZE;		/* blocksize */
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1b4491cdd115..9c677bbd0b08 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
  *  fs/eventpoll.c ( Efficent event polling implementation )
- *  Copyright (C) 2001,...,2003	 Davide Libenzi
+ *  Copyright (C) 2001,...,2006	 Davide Libenzi
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -268,9 +268,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout);
 static int eventpollfs_delete_dentry(struct dentry *dentry);
 static struct inode *ep_eventpoll_inode(void);
-static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data);
+static int eventpollfs_get_sb(struct file_system_type *fs_type,
+			      int flags, const char *dev_name,
+			      void *data, struct vfsmount *mnt);
 
 /*
  * This semaphore is used to serialize ep_free() and eventpoll_release_file().
@@ -337,20 +337,20 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 /* Special initialization for the rb-tree node to detect linkage */
 static inline void ep_rb_initnode(struct rb_node *n)
 {
-	n->rb_parent = n;
+	rb_set_parent(n, n);
 }
 
 /* Removes a node from the rb-tree and marks it for a fast is-linked check */
 static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r)
 {
 	rb_erase(n, r);
-	n->rb_parent = n;
+	rb_set_parent(n, n);
 }
 
 /* Fast check to verify that the item is linked to the main rb-tree */
 static inline int ep_rb_linked(struct rb_node *n)
 {
-	return n->rb_parent != n;
+	return rb_parent(n) != n;
 }
 
 /*
@@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 		/* Notify waiting tasks that events are available */
 		if (waitqueue_active(&ep->wq))
-			wake_up(&ep->wq);
+			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
@@ -1083,7 +1083,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 
 				/* Notify waiting tasks that events are available */
 				if (waitqueue_active(&ep->wq))
-					wake_up(&ep->wq);
+					__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+							 TASK_INTERRUPTIBLE);
 				if (waitqueue_active(&ep->poll_wait))
 					pwake++;
 			}
@@ -1260,7 +1261,8 @@ is_linked:
 	 * wait list.
 	 */
 	if (waitqueue_active(&ep->wq))
-		wake_up(&ep->wq);
+		__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+				 TASK_INTERRUPTIBLE);
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
@@ -1444,7 +1446,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 		 * wait list.
 		 */
 		if (waitqueue_active(&ep->wq))
-			wake_up(&ep->wq);
+			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+					 TASK_INTERRUPTIBLE);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
@@ -1516,7 +1519,7 @@ retry:
 		 * ep_poll_callback() when events will become available.
 		 */
 		init_waitqueue_entry(&wait, current);
-		add_wait_queue(&ep->wq, &wait);
+		__add_wait_queue(&ep->wq, &wait);
 
 		for (;;) {
 			/*
@@ -1536,7 +1539,7 @@ retry:
 			jtimeout = schedule_timeout(jtimeout);
 			write_lock_irqsave(&ep->lock, flags);
 		}
-		remove_wait_queue(&ep->wq, &wait);
+		__remove_wait_queue(&ep->wq, &wait);
 
 		set_current_state(TASK_RUNNING);
 	}
@@ -1595,11 +1598,12 @@ eexit_1:
 }
 
 
-static struct super_block *
+static int
 eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
-		   const char *dev_name, void *data)
+		   const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
+	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
+			     mnt);
 }
 
 
diff --git a/fs/exec.c b/fs/exec.c
index 3a79d97ac234..0b88bf646143 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -49,6 +49,7 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -865,7 +866,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 	bprm->mm = NULL;		/* We're using it now */
 
 	/* This is the point of no return */
-	steal_locks(files);
 	put_files_struct(files);
 
 	current->sas_ss_sp = current->sas_ss_size = 0;
@@ -1085,6 +1085,11 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	/* kernel module loader fixup */
 	/* so we don't try to load run modprobe in kernel space. */
 	set_fs(USER_DS);
+
+	retval = audit_bprm(bprm);
+	if (retval)
+		return retval;
+
 	retval = -ENOENT;
 	for (try=0; try<2; try++) {
 		read_lock(&binfmt_lock);
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index c5d02da73bc3..e0b2b43c1fdb 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
 	  ioctl.o namei.o super.o symlink.o
 
 ext2-$(CONFIG_EXT2_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 2c00953d4b0b..433a213a8bd9 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -521,6 +521,26 @@ io_error:
 	goto out_release;
 }
 
+#ifdef EXT2FS_DEBUG
+
+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+
+unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
+{
+	unsigned int i;
+	unsigned long sum = 0;
+
+	if (!map)
+		return (0);
+	for (i = 0; i < numchars; i++)
+		sum += nibblemap[map->b_data[i] & 0xf] +
+			nibblemap[(map->b_data[i] >> 4) & 0xf];
+	return (sum);
+}
+
+#endif  /*  EXT2FS_DEBUG  */
+
+/* Superblock must be locked */
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
 	struct ext2_group_desc * desc;
@@ -530,7 +550,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
 	unsigned long bitmap_count, x;
 	struct ext2_super_block *es;
 
-	lock_super (sb);
 	es = EXT2_SB(sb)->s_es;
 	desc_count = 0;
 	bitmap_count = 0;
@@ -554,7 +573,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
 	printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
 		(long)le32_to_cpu(es->s_free_blocks_count),
 		desc_count, bitmap_count);
-	unlock_super (sb);
 	return bitmap_count;
 #else
         for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c
deleted file mode 100644
index e9983a0dd396..000000000000
--- a/fs/ext2/bitmap.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  linux/fs/ext2/bitmap.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-
-#ifdef EXT2FS_DEBUG
-
-#include <linux/buffer_head.h>
-
-#include "ext2.h"
-
-static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-
-unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
-{
-	unsigned int i;
-	unsigned long sum = 0;
-	
-	if (!map) 
-		return (0);
-	for (i = 0; i < numchars; i++)
-		sum += nibblemap[map->b_data[i] & 0xf] +
-			nibblemap[(map->b_data[i] >> 4) & 0xf];
-	return (sum);
-}
-
-#endif  /*  EXT2FS_DEBUG  */
-
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d672aa9f4061..92ea8265d7d5 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -159,8 +159,7 @@ fail:
 static struct page * ext2_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
@@ -400,8 +399,7 @@ ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry)
 	de = ext2_find_entry (dir, dentry, &page);
 	if (de) {
 		res = le32_to_cpu(de->inode);
-		kunmap(page);
-		page_cache_release(page);
+		ext2_put_page(page);
 	}
 	return res;
 }
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
index c9c2e5ffa48e..7806b9e8155b 100644
--- a/fs/ext2/fsync.c
+++ b/fs/ext2/fsync.c
@@ -24,7 +24,7 @@
 
 #include "ext2.h"
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>		/* for fsync_inode_buffers() */
+#include <linux/buffer_head.h>		/* for sync_mapping_buffers() */
 
 
 /*
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index e52765219e16..308c252568c6 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -638,6 +638,7 @@ fail:
 	return ERR_PTR(err);
 }
 
+/* Superblock must be locked */
 unsigned long ext2_count_free_inodes (struct super_block * sb)
 {
 	struct ext2_group_desc *desc;
@@ -649,7 +650,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
 	unsigned long bitmap_count = 0;
 	struct buffer_head *bitmap_bh = NULL;
 
-	lock_super (sb);
 	es = EXT2_SB(sb)->s_es;
 	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
 		unsigned x;
@@ -672,7 +672,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
 	printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
 		percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter),
 		desc_count, bitmap_count);
-	unlock_super(sb);
 	return desc_count;
 #else
 	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7e30bae174ed..d4233b2e6436 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -39,7 +39,7 @@
 static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 
 void ext2_error (struct super_block * sb, const char * function,
 		 const char * fmt, ...)
@@ -834,9 +834,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
 			       GFP_KERNEL);
@@ -857,12 +854,18 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	if (!ext2_check_descriptors (sb)) {
 		printk ("EXT2-fs: group descriptors corrupted!\n");
-		db_count = i;
 		goto failed_mount2;
 	}
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+
+	percpu_counter_init(&sbi->s_freeblocks_counter,
+				ext2_count_free_blocks(sb));
+	percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext2_count_free_inodes(sb));
+	percpu_counter_init(&sbi->s_dirs_counter,
+				ext2_count_dirs(sb));
 	/*
 	 * set up enough so that it can read an inode
 	 */
@@ -874,24 +877,18 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root) {
 		iput(root);
 		printk(KERN_ERR "EXT2-fs: get root inode failed\n");
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		dput(sb->s_root);
 		sb->s_root = NULL;
 		printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n");
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_warning(sb, __FUNCTION__,
 			"mounting ext3 filesystem as ext2");
 	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
-				ext2_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
-				ext2_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
-				ext2_count_dirs(sb));
 	return 0;
 
 cantfind_ext2:
@@ -899,7 +896,10 @@ cantfind_ext2:
 		printk("VFS: Can't find an ext2 filesystem on dev %s.\n",
 		       sb->s_id);
 	goto failed_mount;
-
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -1038,12 +1038,14 @@ restore_opts:
 	return err;
 }
 
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	unsigned long overhead;
 	int i;
 
+	lock_super(sb);
 	if (test_opt (sb, MINIX_DF))
 		overhead = 0;
 	else {
@@ -1084,13 +1086,14 @@ static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
 	buf->f_files = le32_to_cpu(sbi->s_es->s_inodes_count);
 	buf->f_ffree = ext2_count_free_inodes (sb);
 	buf->f_namelen = EXT2_NAME_LEN;
+	unlock_super(sb);
 	return 0;
 }
 
-static struct super_block *ext2_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ext2_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
 }
 
 #ifdef CONFIG_QUOTA
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 77927d6938f6..96172e89ddc3 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -163,20 +163,19 @@ restart:
 #endif
 
 static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
+goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 			unsigned int group, struct super_block * sb)
 {
-	unsigned long group_first_block, group_last_block;
+	ext3_fsblk_t group_first_block, group_last_block;
 
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-				group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 	group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
 	if ((rsv->_rsv_start > group_last_block) ||
 	    (rsv->_rsv_end < group_first_block))
 		return 0;
-	if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
-		|| (goal + group_first_block > rsv->_rsv_end)))
+	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+		|| (grp_goal + group_first_block > rsv->_rsv_end)))
 		return 0;
 	return 1;
 }
@@ -187,7 +186,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
  * Returns NULL if there are no windows or if all windows start after the goal.
  */
 static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, unsigned long goal)
+search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
 {
 	struct rb_node *n = root->rb_node;
 	struct ext3_reserve_window_node *rsv;
@@ -223,7 +222,7 @@ void ext3_rsv_window_add(struct super_block *sb,
 {
 	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
 	struct rb_node *node = &rsv->rsv_node;
-	unsigned int start = rsv->rsv_start;
+	ext3_fsblk_t start = rsv->rsv_start;
 
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
@@ -310,20 +309,20 @@ void ext3_discard_reservation(struct inode *inode)
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
-			 unsigned long block, unsigned long count,
-			 int *pdquot_freed_blocks)
+			 ext3_fsblk_t block, unsigned long count,
+			 unsigned long *pdquot_freed_blocks)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gd_bh;
 	unsigned long block_group;
-	unsigned long bit;
+	ext3_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
 	struct ext3_group_desc * desc;
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi;
 	int err = 0, ret;
-	unsigned group_freed;
+	ext3_grpblk_t group_freed;
 
 	*pdquot_freed_blocks = 0;
 	sbi = EXT3_SB(sb);
@@ -333,7 +332,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	    block + count > le32_to_cpu(es->s_blocks_count)) {
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks not in datazone - "
-			    "block = %lu, count = %lu", block, count);
+			    "block = "E3FSBLK", count = %lu", block, count);
 		goto error_return;
 	}
 
@@ -369,7 +368,7 @@ do_more:
 		      sbi->s_itb_per_group))
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks in system zones - "
-			    "Block = %lu, count = %lu",
+			    "Block = "E3FSBLK", count = %lu",
 			    block, count);
 
 	/*
@@ -453,7 +452,8 @@ do_more:
 						bit + i, bitmap_bh->b_data)) {
 			jbd_unlock_bh_state(bitmap_bh);
 			ext3_error(sb, __FUNCTION__,
-				"bit already cleared for block %lu", block + i);
+				"bit already cleared for block "E3FSBLK,
+				 block + i);
 			jbd_lock_bh_state(bitmap_bh);
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
@@ -493,10 +493,10 @@ error_return:
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks(handle_t *handle, struct inode *inode,
-			unsigned long block, unsigned long count)
+			ext3_fsblk_t block, unsigned long count)
 {
 	struct super_block * sb;
-	int dquot_freed_blocks;
+	unsigned long dquot_freed_blocks;
 
 	sb = inode->i_sb;
 	if (!sb) {
@@ -525,7 +525,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
  * data-writes at some point, and disable it for metadata allocations or
  * sync-data inodes.
  */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
 {
 	int ret;
 	struct journal_head *jh = bh2jh(bh);
@@ -542,11 +542,11 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
 	return ret;
 }
 
-static int
-bitmap_search_next_usable_block(int start, struct buffer_head *bh,
-					int maxblocks)
+static ext3_grpblk_t
+bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+					ext3_grpblk_t maxblocks)
 {
-	int next;
+	ext3_grpblk_t next;
 	struct journal_head *jh = bh2jh(bh);
 
 	/*
@@ -576,10 +576,11 @@ bitmap_search_next_usable_block(int start, struct buffer_head *bh,
  * the initial goal; then for a free byte somewhere in the bitmap; then
  * for any free bit in the bitmap.
  */
-static int
-find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+static ext3_grpblk_t
+find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+			ext3_grpblk_t maxblocks)
 {
-	int here, next;
+	ext3_grpblk_t here, next;
 	char *p, *r;
 
 	if (start > 0) {
@@ -591,7 +592,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
 		 * next 64-bit boundary is simple..
 		 */
-		int end_goal = (start + 63) & ~63;
+		ext3_grpblk_t end_goal = (start + 63) & ~63;
 		if (end_goal > maxblocks)
 			end_goal = maxblocks;
 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
@@ -628,7 +629,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
  * zero (failure).
  */
 static inline int
-claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
 {
 	struct journal_head *jh = bh2jh(bh);
 	int ret;
@@ -651,19 +652,18 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
  * new bitmap.  In that case we must release write access to the old one via
  * ext3_journal_release_buffer(), else we'll run out of credits.
  */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-			struct buffer_head *bitmap_bh, int goal,
+			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
 			unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
-	int group_first_block, start, end;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t start, end;
 	unsigned long num = 0;
 
 	/* we do allocation within the reservation window if we have a window */
 	if (my_rsv) {
-		group_first_block =
-			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+		group_first_block = ext3_group_first_block_no(sb, group);
 		if (my_rsv->_rsv_start >= group_first_block)
 			start = my_rsv->_rsv_start - group_first_block;
 		else
@@ -673,13 +673,13 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 		if (end > EXT3_BLOCKS_PER_GROUP(sb))
 			/* reservation window crosses group boundary */
 			end = EXT3_BLOCKS_PER_GROUP(sb);
-		if ((start <= goal) && (goal < end))
-			start = goal;
+		if ((start <= grp_goal) && (grp_goal < end))
+			start = grp_goal;
 		else
-			goal = -1;
+			grp_goal = -1;
 	} else {
-		if (goal > 0)
-			start = goal;
+		if (grp_goal > 0)
+			start = grp_goal;
 		else
 			start = 0;
 		end = EXT3_BLOCKS_PER_GROUP(sb);
@@ -688,43 +688,43 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
 
 repeat:
-	if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
-		goal = find_next_usable_block(start, bitmap_bh, end);
-		if (goal < 0)
+	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
+		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		if (grp_goal < 0)
 			goto fail_access;
 		if (!my_rsv) {
 			int i;
 
-			for (i = 0; i < 7 && goal > start &&
-					ext3_test_allocatable(goal - 1,
+			for (i = 0; i < 7 && grp_goal > start &&
+					ext3_test_allocatable(grp_goal - 1,
 								bitmap_bh);
-					i++, goal--)
+					i++, grp_goal--)
 				;
 		}
 	}
-	start = goal;
+	start = grp_goal;
 
-	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		/*
 		 * The block was allocated by another thread, or it was
 		 * allocated and then freed by another thread
 		 */
 		start++;
-		goal++;
+		grp_goal++;
 		if (start >= end)
 			goto fail_access;
 		goto repeat;
 	}
 	num++;
-	goal++;
-	while (num < *count && goal < end
-		&& ext3_test_allocatable(goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	grp_goal++;
+	while (num < *count && grp_goal < end
+		&& ext3_test_allocatable(grp_goal, bitmap_bh)
+		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		num++;
-		goal++;
+		grp_goal++;
 	}
 	*count = num;
-	return goal - num;
+	return grp_goal - num;
 fail_access:
 	*count = num;
 	return -1;
@@ -766,12 +766,13 @@ fail_access:
 static int find_next_reservable_window(
 				struct ext3_reserve_window_node *search_head,
 				struct ext3_reserve_window_node *my_rsv,
-				struct super_block * sb, int start_block,
-				int last_block)
+				struct super_block * sb,
+				ext3_fsblk_t start_block,
+				ext3_fsblk_t last_block)
 {
 	struct rb_node *next;
 	struct ext3_reserve_window_node *rsv, *prev;
-	int cur;
+	ext3_fsblk_t cur;
 	int size = my_rsv->rsv_goal_size;
 
 	/* TODO: make the start of the reservation window byte-aligned */
@@ -873,10 +874,10 @@ static int find_next_reservable_window(
  *
  *	@rsv: the reservation
  *
- *	@goal: The goal (group-relative).  It is where the search for a
+ *	@grp_goal: The goal (group-relative).  It is where the search for a
  *		free reservable space should start from.
- *		if we have a goal(goal >0 ), then start from there,
- *		no goal(goal = -1), we start from the first block
+ *		if we have a grp_goal(grp_goal >0 ), then start from there,
+ *		no grp_goal(grp_goal = -1), we start from the first block
  *		of the group.
  *
  *	@sb: the super block
@@ -885,25 +886,24 @@ static int find_next_reservable_window(
  *
  */
 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
-		int goal, struct super_block *sb,
+		ext3_grpblk_t grp_goal, struct super_block *sb,
 		unsigned int group, struct buffer_head *bitmap_bh)
 {
 	struct ext3_reserve_window_node *search_head;
-	int group_first_block, group_end_block, start_block;
-	int first_free_block;
+	ext3_fsblk_t group_first_block, group_end_block, start_block;
+	ext3_grpblk_t first_free_block;
 	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
 	unsigned long size;
 	int ret;
 	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
 
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-				group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
-	if (goal < 0)
+	if (grp_goal < 0)
 		start_block = group_first_block;
 	else
-		start_block = goal + group_first_block;
+		start_block = grp_goal + group_first_block;
 
 	size = my_rsv->rsv_goal_size;
 
@@ -1057,14 +1057,15 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
  * sorted double linked list should be fast.
  *
  */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			unsigned int group, struct buffer_head *bitmap_bh,
-			int goal, struct ext3_reserve_window_node * my_rsv,
+			ext3_grpblk_t grp_goal,
+			struct ext3_reserve_window_node * my_rsv,
 			unsigned long *count, int *errp)
 {
-	unsigned long group_first_block;
-	int ret = 0;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t ret = 0;
 	int fatal;
 	unsigned long num = *count;
 
@@ -1090,17 +1091,16 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	if (my_rsv == NULL ) {
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-						goal, count, NULL);
+						grp_goal, count, NULL);
 		goto out;
 	}
 	/*
-	 * goal is a group relative block number (if there is a goal)
-	 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
+	 * grp_goal is a group relative block number (if there is a goal)
+	 * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
 	 * first block is a filesystem wide block number
 	 * first block is the block number of the first block in this group
 	 */
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 
 	/*
 	 * Basically we will allocate a new block from inode's reservation
@@ -1119,24 +1119,24 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	while (1) {
 		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) {
+			!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) {
 			if (my_rsv->rsv_goal_size < *count)
 				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, goal, sb,
+			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
 							group, bitmap_bh);
 			if (ret < 0)
 				break;			/* failed */
 
-			if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb))
-				goal = -1;
-		} else if (goal > 0 && (my_rsv->rsv_end-goal+1) < *count)
+			if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb))
+				grp_goal = -1;
+		} else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count)
 			try_to_extend_reservation(my_rsv, sb,
-					*count-my_rsv->rsv_end + goal - 1);
+					*count-my_rsv->rsv_end + grp_goal - 1);
 
 		if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
 		    || (my_rsv->rsv_end < group_first_block))
 			BUG();
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal,
 					   &num, &my_rsv->rsv_window);
 		if (ret >= 0) {
 			my_rsv->rsv_alloc_hit += num;
@@ -1164,7 +1164,7 @@ out:
 
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
-	int free_blocks, root_blocks;
+	ext3_fsblk_t free_blocks, root_blocks;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
@@ -1200,19 +1200,20 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
  * bitmap, and then for any free bit if that fails.
  * This function also updates quota and i_blocks field.
  */
-int ext3_new_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, unsigned long *count, int *errp)
+ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
 	int group_no;
 	int goal_group;
-	int ret_block;
+	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
+	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
+	ext3_fsblk_t ret_block;		/* filesystem-wide allocated block */
 	int bgi;			/* blockgroup iteration index */
-	int target_block;
 	int fatal = 0, err;
 	int performed_allocation = 0;
-	int free_blocks;
+	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -1285,16 +1286,17 @@ retry:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, ret_block, my_rsv, &num, &fatal);
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, grp_target_blk,
+					my_rsv,	&num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 
@@ -1327,11 +1329,15 @@ retry:
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, -1, my_rsv, &num, &fatal);
+		/*
+		 * try to allocate block(s) from this group, without a goal(-1).
+		 */
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, -1, my_rsv,
+					&num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0) 
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 	/*
@@ -1360,18 +1366,18 @@ allocated:
 	if (fatal)
 		goto out;
 
-	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
-				+ le32_to_cpu(es->s_first_data_block);
+	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
 
-	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), target_block, num) ||
-	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), target_block, num) ||
-	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
 		      EXT3_SB(sb)->s_itb_per_group) ||
-	    in_range(target_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
 		      EXT3_SB(sb)->s_itb_per_group))
 		ext3_error(sb, "ext3_new_block",
 			    "Allocating block in system zone - "
-			    "blocks from %u, length %lu", target_block, num);
+			    "blocks from "E3FSBLK", length %lu",
+			     ret_block, num);
 
 	performed_allocation = 1;
 
@@ -1380,7 +1386,7 @@ allocated:
 		struct buffer_head *debug_bh;
 
 		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, target_block);
+		debug_bh = sb_find_get_block(sb, ret_block);
 		if (debug_bh) {
 			BUFFER_TRACE(debug_bh, "state when allocated");
 			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
@@ -1393,24 +1399,21 @@ allocated:
 		int i;
 
 		for (i = 0; i < num; i++) {
-			if (ext3_test_bit(ret_block,
+			if (ext3_test_bit(grp_alloc_blk+i,
 					bh2jh(bitmap_bh)->b_committed_data)) {
 				printk("%s: block was unexpectedly set in "
 					"b_committed_data\n", __FUNCTION__);
 			}
 		}
 	}
-	ext3_debug("found bit %d\n", ret_block);
+	ext3_debug("found bit %d\n", grp_alloc_blk);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	jbd_unlock_bh_state(bitmap_bh);
 #endif
 
-	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
-	ret_block = target_block;
-
 	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
 		ext3_error(sb, "ext3_new_block",
-			    "block(%d) >= blocks count(%d) - "
+			    "block("E3FSBLK") >= blocks count(%d) - "
 			    "block_group = %d, es == %p ", ret_block,
 			le32_to_cpu(es->s_blocks_count), group_no, es);
 		goto out;
@@ -1421,7 +1424,7 @@ allocated:
 	 * list of some description.  We don't know in advance whether
 	 * the caller wants to use it as metadata or data.
 	 */
-	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
+	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
 			ret_block, goal_hits, goal_attempts);
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
@@ -1461,23 +1464,24 @@ out:
 	return 0;
 }
 
-int ext3_new_block(handle_t *handle, struct inode *inode,
-			unsigned long goal, int *errp)
+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int *errp)
 {
 	unsigned long count = 1;
 
 	return ext3_new_blocks(handle, inode, goal, &count, errp);
 }
 
-unsigned long ext3_count_free_blocks(struct super_block *sb)
+ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 {
-	unsigned long desc_count;
+	ext3_fsblk_t desc_count;
 	struct ext3_group_desc *gdp;
 	int i;
 	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
 #ifdef EXT3FS_DEBUG
 	struct ext3_super_block *es;
-	unsigned long bitmap_count, x;
+	ext3_fsblk_t bitmap_count;
+	unsigned long x;
 	struct buffer_head *bitmap_bh = NULL;
 
 	es = EXT3_SB(sb)->s_es;
@@ -1502,8 +1506,10 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
-	       le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+	printk("ext3_count_free_blocks: stored = "E3FSBLK
+		", computed = "E3FSBLK", "E3FSBLK"\n",
+	       le32_to_cpu(es->s_free_blocks_count),
+		desc_count, bitmap_count);
 	return bitmap_count;
 #else
 	desc_count = 0;
@@ -1520,7 +1526,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 }
 
 static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
+block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
 {
 	return ext3_test_bit ((block -
 		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index f37528ed222e..fbb0d4ed07d4 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -284,7 +284,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 		 * beginning of the loop and try to free the parent
 		 * node.
 		 */
-		parent = n->rb_parent;
+		parent = rb_parent(n);
 		fname = rb_entry(n, struct fname, rb_hash);
 		while (fname) {
 			struct fname * old = fname;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dc826464f313..36546ed36a14 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -262,9 +262,11 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
 	int freei, avefreei;
-	int freeb, avefreeb;
-	int blocks_per_dir, ndirs;
-	int max_debt, max_dirs, min_blocks, min_inodes;
+	ext3_fsblk_t freeb, avefreeb;
+	ext3_fsblk_t blocks_per_dir;
+	int ndirs;
+	int max_debt, max_dirs, min_inodes;
+	ext3_grpblk_t min_blocks;
 	int group = -1, i;
 	struct ext3_group_desc *desc;
 	struct buffer_head *bh;
@@ -307,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	min_inodes = avefreei - inodes_per_group / 4;
 	min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
 
-	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
+	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
 	if (max_debt * INODE_COST > inodes_per_group)
 		max_debt = inodes_per_group / INODE_COST;
 	if (max_debt > 255)
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2edd7eec88fd..0321e1b9034a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -62,7 +62,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode)
  * still needs to be revoked.
  */
 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-			struct buffer_head *bh, int blocknr)
+			struct buffer_head *bh, ext3_fsblk_t blocknr)
 {
 	int err;
 
@@ -407,13 +407,13 @@ no_block:
  *
  *	Caller must make sure that @ind is valid and will stay that way.
  */
-static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
 	__le32 *p;
-	unsigned long bg_start;
-	unsigned long colour;
+	ext3_fsblk_t bg_start;
+	ext3_grpblk_t colour;
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -429,8 +429,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
-		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
 	colour = (current->pid % 16) *
 			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 	return bg_start + colour;
@@ -448,7 +447,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
  *	stores it in *@goal and returns zero.
  */
 
-static unsigned long ext3_find_goal(struct inode *inode, long block,
+static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 		Indirect chain[4], Indirect *partial)
 {
 	struct ext3_block_alloc_info *block_i;
@@ -516,13 +515,13 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
  *		direct blocks
  */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, int indirect_blks, int blks,
-			unsigned long long new_blocks[4], int *err)
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[4], int *err)
 {
 	int target, i;
 	unsigned long count = 0;
 	int index = 0;
-	unsigned long current_block = 0;
+	ext3_fsblk_t current_block = 0;
 	int ret = 0;
 
 	/*
@@ -592,7 +591,7 @@ failed_out:
  *	as described above and return 0.
  */
 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-			int indirect_blks, int *blks, unsigned long goal,
+			int indirect_blks, int *blks, ext3_fsblk_t goal,
 			int *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
@@ -600,8 +599,8 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 	int err = 0;
 	struct buffer_head *bh;
 	int num;
-	unsigned long long new_blocks[4];
-	unsigned long long current_block;
+	ext3_fsblk_t new_blocks[4];
+	ext3_fsblk_t current_block;
 
 	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
 				*blks, new_blocks, &err);
@@ -688,7 +687,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	int i;
 	int err = 0;
 	struct ext3_block_alloc_info *block_i;
-	unsigned long current_block;
+	ext3_fsblk_t current_block;
 
 	block_i = EXT3_I(inode)->i_block_alloc_info;
 	/*
@@ -795,13 +794,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
-	unsigned long goal;
+	ext3_fsblk_t goal;
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	int count = 0;
-	unsigned long first_block = 0;
+	ext3_fsblk_t first_block = 0;
 
 
 	J_ASSERT(handle != NULL || create == 0);
@@ -819,7 +818,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
-			unsigned long blk;
+			ext3_fsblk_t blk;
 
 			if (!verify_chain(chain, partial)) {
 				/*
@@ -1759,7 +1758,7 @@ void ext3_set_aops(struct inode *inode)
 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
-	unsigned long index = from >> PAGE_CACHE_SHIFT;
+	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
 	struct inode *inode = mapping->host;
@@ -1960,7 +1959,7 @@ no_top:
  * than `count' because there can be holes in there.
  */
 static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
-		struct buffer_head *bh, unsigned long block_to_free,
+		struct buffer_head *bh, ext3_fsblk_t block_to_free,
 		unsigned long count, __le32 *first, __le32 *last)
 {
 	__le32 *p;
@@ -2022,12 +2021,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
 			   struct buffer_head *this_bh,
 			   __le32 *first, __le32 *last)
 {
-	unsigned long block_to_free = 0;    /* Starting block # of a run */
+	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
 	unsigned long count = 0;	    /* Number of blocks in the run */ 
 	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
 					       corresponding to
 					       block_to_free */
-	unsigned long nr;		    /* Current block # */
+	ext3_fsblk_t nr;		    /* Current block # */
 	__le32 *p;			    /* Pointer into inode/ind
 					       for current block */
 	int err;
@@ -2089,7 +2088,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			       struct buffer_head *parent_bh,
 			       __le32 *first, __le32 *last, int depth)
 {
-	unsigned long nr;
+	ext3_fsblk_t nr;
 	__le32 *p;
 
 	if (is_handle_aborted(handle))
@@ -2113,7 +2112,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			 */
 			if (!bh) {
 				ext3_error(inode->i_sb, "ext3_free_branches",
-					   "Read failure, inode=%ld, block=%ld",
+					   "Read failure, inode=%ld, block="E3FSBLK,
 					   inode->i_ino, nr);
 				continue;
 			}
@@ -2394,11 +2393,12 @@ out_stop:
 	ext3_journal_stop(handle);
 }
 
-static unsigned long ext3_get_inode_block(struct super_block *sb,
+static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
 		unsigned long ino, struct ext3_iloc *iloc)
 {
 	unsigned long desc, group_desc, block_group;
-	unsigned long offset, block;
+	unsigned long offset;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 	struct ext3_group_desc * gdp;
 
@@ -2448,7 +2448,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb,
 static int __ext3_get_inode_loc(struct inode *inode,
 				struct ext3_iloc *iloc, int in_mem)
 {
-	unsigned long block;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 
 	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
@@ -2459,7 +2459,8 @@ static int __ext3_get_inode_loc(struct inode *inode,
 	if (!bh) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 				"unable to read inode block - "
-				"inode=%lu, block=%lu", inode->i_ino, block);
+				"inode=%lu, block="E3FSBLK,
+				 inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -2540,7 +2541,7 @@ make_io:
 		if (!buffer_uptodate(bh)) {
 			ext3_error(inode->i_sb, "ext3_get_inode_loc",
 					"unable to read inode block - "
-					"inode=%lu, block=%lu",
+					"inode=%lu, block="E3FSBLK,
 					inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8c22aa9a7fbb..3a6b012d120c 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -204,7 +204,7 @@ flags_err:
 		return 0;
 	}
 	case EXT3_IOC_GROUP_EXTEND: {
-		unsigned long n_blocks_count;
+		ext3_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
 		int err;
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b8f5cd1e540d..d9176dba3698 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1379,7 +1379,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 	int	dx_fallback=0;
 #endif
 	unsigned blocksize;
-	unsigned nlen, rlen;
 	u32 block, blocks;
 
 	sb = dir->i_sb;
@@ -1417,8 +1416,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 		return retval;
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(rlen = blocksize);
-	nlen = 0;
+	de->rec_len = cpu_to_le16(blocksize);
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 8aac5334680d..dfd811895d8f 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -28,16 +28,16 @@ static int verify_group_input(struct super_block *sb,
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	unsigned start = le32_to_cpu(es->s_blocks_count);
-	unsigned end = start + input->blocks_count;
+	ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
+	ext3_fsblk_t end = start + input->blocks_count;
 	unsigned group = input->group;
-	unsigned itend = input->inode_table + sbi->s_itb_per_group;
+	ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
 	unsigned overhead = ext3_bg_has_super(sb, group) ?
 		(1 + ext3_bg_num_gdb(sb, group) +
 		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
-	unsigned metaend = start + overhead;
+	ext3_fsblk_t metaend = start + overhead;
 	struct buffer_head *bh = NULL;
-	int free_blocks_count;
+	ext3_grpblk_t free_blocks_count;
 	int err = -EINVAL;
 
 	input->free_blocks_count = free_blocks_count =
@@ -64,7 +64,8 @@ static int verify_group_input(struct super_block *sb,
 		ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
 			     input->blocks_count);
 	else if (!(bh = sb_bread(sb, end - 1)))
-		ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)",
+		ext3_warning(sb, __FUNCTION__,
+			     "Cannot read last block ("E3FSBLK")",
 			     end - 1);
 	else if (outside(input->block_bitmap, start, end))
 		ext3_warning(sb, __FUNCTION__,
@@ -77,7 +78,7 @@ static int verify_group_input(struct super_block *sb,
 	else if (outside(input->inode_table, start, end) ||
 	         outside(itend - 1, start, end))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table not in group (blocks %u-%u)",
+			     "Inode table not in group (blocks %u-"E3FSBLK")",
 			     input->inode_table, itend - 1);
 	else if (input->inode_bitmap == input->block_bitmap)
 		ext3_warning(sb, __FUNCTION__,
@@ -85,24 +86,27 @@ static int verify_group_input(struct super_block *sb,
 			     input->block_bitmap);
 	else if (inside(input->block_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in inode table (%u-%u)",
+			     "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->block_bitmap, input->inode_table, itend-1);
 	else if (inside(input->inode_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in inode table (%u-%u)",
+			     "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->inode_bitmap, input->inode_table, itend-1);
 	else if (inside(input->block_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in GDT table (%u-%u)",
+			     "Block bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->block_bitmap, start, metaend - 1);
 	else if (inside(input->inode_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in GDT table (%u-%u)",
+			     "Inode bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_bitmap, start, metaend - 1);
 	else if (inside(input->inode_table, start, metaend) ||
 	         inside(itend - 1, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table (%u-%u) overlaps GDT table (%u-%u)",
+			     "Inode table (%u-"E3FSBLK") overlaps "
+			     "GDT table ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_table, itend - 1, start, metaend - 1);
 	else
 		err = 0;
@@ -112,7 +116,7 @@ static int verify_group_input(struct super_block *sb,
 }
 
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
-				  unsigned long blk)
+				  ext3_fsblk_t blk)
 {
 	struct buffer_head *bh;
 	int err;
@@ -163,15 +167,14 @@ static int setup_new_group_blocks(struct super_block *sb,
 				  struct ext3_new_group_data *input)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long start = input->group * sbi->s_blocks_per_group +
-		le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
 	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
 		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
 	unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
 	struct buffer_head *bh;
 	handle_t *handle;
-	unsigned long block;
-	int bit;
+	ext3_fsblk_t block;
+	ext3_grpblk_t bit;
 	int i;
 	int err = 0, err2;
 
@@ -328,7 +331,7 @@ static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
 static int verify_reserved_gdb(struct super_block *sb,
 			       struct buffer_head *primary)
 {
-	const unsigned long blk = primary->b_blocknr;
+	const ext3_fsblk_t blk = primary->b_blocknr;
 	const unsigned long end = EXT3_SB(sb)->s_groups_count;
 	unsigned three = 1;
 	unsigned five = 5;
@@ -340,7 +343,8 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved GDT %ld missing grp %d (%ld)",
+				     "reserved GDT "E3FSBLK
+				     " missing grp %d ("E3FSBLK")",
 				     blk, grp,
 				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
 			return -EINVAL;
@@ -372,7 +376,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	struct super_block *sb = inode->i_sb;
 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
 	unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
-	unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
 	struct buffer_head **o_group_desc, **n_group_desc;
 	struct buffer_head *dind;
 	int gdbackups;
@@ -417,7 +421,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	data = (__u32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
 		ext3_warning(sb, __FUNCTION__,
-			     "new group %u GDT block %lu not reserved",
+			     "new group %u GDT block "E3FSBLK" not reserved",
 			     input->group, gdblock);
 		err = -EINVAL;
 		goto exit_dind;
@@ -515,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	struct buffer_head **primary;
 	struct buffer_head *dind;
 	struct ext3_iloc iloc;
-	unsigned long blk;
+	ext3_fsblk_t blk;
 	__u32 *data, *end;
 	int gdbackups = 0;
 	int res, i;
@@ -540,7 +544,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	for (res = 0; res < reserved_gdb; res++, blk++) {
 		if (le32_to_cpu(*data) != blk) {
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved block %lu not at offset %ld",
+				     "reserved block "E3FSBLK
+				     " not at offset %ld",
 				     blk, (long)(data - (__u32 *)dind->b_data));
 			err = -EINVAL;
 			goto exit_bh;
@@ -767,7 +772,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 	if (input->group != sbi->s_groups_count) {
 		ext3_warning(sb, __FUNCTION__,
 			     "multiple resizers run on filesystem!");
-		unlock_super(sb);
 		err = -EBUSY;
 		goto exit_journal;
 	}
@@ -903,15 +907,16 @@ exit_put:
  * GDT blocks are reserved to grow to the desired size.
  */
 int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
-		      unsigned long n_blocks_count)
+		      ext3_fsblk_t n_blocks_count)
 {
-	unsigned long o_blocks_count;
+	ext3_fsblk_t o_blocks_count;
 	unsigned long o_groups_count;
-	unsigned long last;
-	int add;
+	ext3_grpblk_t last;
+	ext3_grpblk_t add;
 	struct buffer_head * bh;
 	handle_t *handle;
-	int err, freed_blocks;
+	int err;
+	unsigned long freed_blocks;
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
@@ -920,12 +925,22 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	o_groups_count = EXT3_SB(sb)->s_groups_count;
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n",
+		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" to "E3FSBLK" blocks\n",
 		       o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
 		return 0;
 
+	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+			" too large to resize to %lu blocks safely\n",
+			sb->s_id, n_blocks_count);
+		if (sizeof(sector_t) < 8)
+			ext3_warning(sb, __FUNCTION__,
+			"CONFIG_LBD not enabled\n");
+		return -EINVAL;
+	}
+
 	if (n_blocks_count < o_blocks_count) {
 		ext3_warning(sb, __FUNCTION__,
 			     "can't shrink FS - resize aborted");
@@ -949,7 +964,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 
 	if (o_blocks_count + add < n_blocks_count)
 		ext3_warning(sb, __FUNCTION__,
-			     "will only finish group (%lu blocks, %u new)",
+			     "will only finish group ("E3FSBLK
+			     " blocks, %u new)",
 			     o_blocks_count + add, add);
 
 	/* See if the device is actually as big as what was requested */
@@ -992,10 +1008,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f8a5266ea1ff..b2891cc29db1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -58,7 +58,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait);
 static const char *ext3_decode_error(struct super_block * sb, int errno,
 				     char nbuf[16]);
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static void ext3_unlockfs(struct super_block *sb);
 static void ext3_write_super (struct super_block * sb);
 static void ext3_write_super_lockfs(struct super_block *sb);
@@ -499,20 +499,21 @@ static void ext3_clear_inode(struct inode *inode)
 {
 	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-       if (EXT3_I(inode)->i_acl &&
-           EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
-               posix_acl_release(EXT3_I(inode)->i_acl);
-               EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
-       }
-       if (EXT3_I(inode)->i_default_acl &&
-           EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
-               posix_acl_release(EXT3_I(inode)->i_default_acl);
-               EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
-       }
+	if (EXT3_I(inode)->i_acl &&
+			EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
+		posix_acl_release(EXT3_I(inode)->i_acl);
+		EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
+	}
+	if (EXT3_I(inode)->i_default_acl &&
+			EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
+		posix_acl_release(EXT3_I(inode)->i_default_acl);
+		EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
+	}
 #endif
 	ext3_discard_reservation(inode);
 	EXT3_I(inode)->i_block_alloc_info = NULL;
-	kfree(rsv);
+	if (unlikely(rsv))
+		kfree(rsv);
 }
 
 static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -688,14 +689,15 @@ static match_table_t tokens = {
 	{Opt_resize, "resize"},
 };
 
-static unsigned long get_sb_block(void **data)
+static ext3_fsblk_t get_sb_block(void **data)
 {
-	unsigned long 	sb_block;
+	ext3_fsblk_t 	sb_block;
 	char 		*options = (char *) *data;
 
 	if (!options || strncmp(options, "sb=", 3) != 0)
 		return 1;	/* Default location */
 	options += 3;
+	/*todo: use simple_strtoll with >32bit ext3 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
 		printk("EXT3-fs: Invalid sb specification: %s\n",
@@ -710,7 +712,7 @@ static unsigned long get_sb_block(void **data)
 
 static int parse_options (char *options, struct super_block *sb,
 			  unsigned long *inum, unsigned long *journal_devnum,
-			  unsigned long *n_blocks_count, int is_remount)
+			  ext3_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	char * p;
@@ -1127,7 +1129,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 static int ext3_check_descriptors (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block);
 	struct ext3_group_desc * gdp = NULL;
 	int desc_block = 0;
 	int i;
@@ -1314,15 +1316,14 @@ static loff_t ext3_max_size(int bits)
 	return res;
 }
 
-static unsigned long descriptor_loc(struct super_block *sb,
-				    unsigned long logic_sb_block,
+static ext3_fsblk_t descriptor_loc(struct super_block *sb,
+				    ext3_fsblk_t logic_sb_block,
 				    int nr)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long bg, first_data_block, first_meta_bg;
+	unsigned long bg, first_meta_bg;
 	int has_super = 0;
 
-	first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
 	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
 
 	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
@@ -1331,7 +1332,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
 	bg = sbi->s_desc_per_block * nr;
 	if (ext3_bg_has_super(sb, bg))
 		has_super = 1;
-	return (first_data_block + has_super + (bg * sbi->s_blocks_per_group));
+	return (has_super + ext3_group_first_block_no(sb, bg));
 }
 
 
@@ -1340,9 +1341,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	struct buffer_head * bh;
 	struct ext3_super_block *es = NULL;
 	struct ext3_sb_info *sbi;
-	unsigned long block;
-	unsigned long sb_block = get_sb_block(&data);
-	unsigned long logic_sb_block;
+	ext3_fsblk_t block;
+	ext3_fsblk_t sb_block = get_sb_block(&data);
+	ext3_fsblk_t logic_sb_block;
 	unsigned long offset = 0;
 	unsigned long journal_inum = 0;
 	unsigned long journal_devnum = 0;
@@ -1564,6 +1565,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	if (le32_to_cpu(es->s_blocks_count) >
+		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+			" too large to mount safely\n", sb->s_id);
+		if (sizeof(sector_t) < 8)
+			printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
+					"enabled\n");
+		goto failed_mount;
+	}
+
 	if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext3;
 	sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
@@ -1579,9 +1590,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -1595,12 +1603,20 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		}
 	}
 	if (!ext3_check_descriptors (sb)) {
-		printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
+		printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
 		goto failed_mount2;
 	}
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+
+	percpu_counter_init(&sbi->s_freeblocks_counter,
+		ext3_count_free_blocks(sb));
+	percpu_counter_init(&sbi->s_freeinodes_counter,
+		ext3_count_free_inodes(sb));
+	percpu_counter_init(&sbi->s_dirs_counter,
+		ext3_count_dirs(sb));
+
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
 	sbi->s_rsv_window_root = RB_ROOT;
@@ -1639,16 +1655,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext3_load_journal(sb, es, journal_devnum))
-			goto failed_mount2;
+			goto failed_mount3;
 	} else if (journal_inum) {
 		if (ext3_create_journal(sb, es, journal_inum))
-			goto failed_mount2;
+			goto failed_mount3;
 	} else {
 		if (!silent)
 			printk (KERN_ERR
 				"ext3: No journal on filesystem on %s\n",
 				sb->s_id);
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 
 	/* We have now updated the journal if required, so we can
@@ -1671,7 +1687,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
 			printk(KERN_ERR "EXT3-fs: Journal does not support "
 			       "requested data journaling mode\n");
-			goto failed_mount3;
+			goto failed_mount4;
 		}
 	default:
 		break;
@@ -1694,13 +1710,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!sb->s_root) {
 		printk(KERN_ERR "EXT3-fs: get root inode failed\n");
 		iput(root);
-		goto failed_mount3;
+		goto failed_mount4;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		dput(sb->s_root);
 		sb->s_root = NULL;
 		printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
-		goto failed_mount3;
+		goto failed_mount4;
 	}
 
 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -1723,13 +1739,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
-		ext3_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
-		ext3_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
-		ext3_count_dirs(sb));
-
 	lock_kernel();
 	return 0;
 
@@ -1739,8 +1748,12 @@ cantfind_ext3:
 		       sb->s_id);
 	goto failed_mount;
 
-failed_mount3:
+failed_mount4:
 	journal_destroy(sbi->s_journal);
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -1827,10 +1840,10 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 {
 	struct buffer_head * bh;
 	journal_t *journal;
-	int start;
-	int len;
+	ext3_fsblk_t start;
+	ext3_fsblk_t len;
 	int hblock, blocksize;
-	unsigned long sb_block;
+	ext3_fsblk_t sb_block;
 	unsigned long offset;
 	struct ext3_super_block * es;
 	struct block_device *bdev;
@@ -2203,7 +2216,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 {
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long n_blocks_count = 0;
+	ext3_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext3_mount_options old_opts;
 	int err;
@@ -2318,11 +2331,12 @@ restore_opts:
 	return err;
 }
 
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	unsigned long overhead;
+	ext3_fsblk_t overhead;
 	int i;
 
 	if (test_opt (sb, MINIX_DF))
@@ -2646,10 +2660,10 @@ out:
 
 #endif
 
-static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ext3_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
 }
 
 static struct file_system_type ext3_fs_type = {
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e8d60bf6b7df..a44a0562203a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	error = -ENODATA;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh)
 		goto cleanup;
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 bad_block:	ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 	error = 0;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	error = -EIO;
 	if (!bh)
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext3_xattr_check_block(bs->bh)) {
 			ext3_error(sb, __FUNCTION__,
-				"inode %ld: bad block %d", inode->i_ino,
+				"inode %ld: bad block "E3FSBLK, inode->i_ino,
 				EXT3_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -792,11 +792,12 @@ inserted:
 			get_bh(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			int goal = le32_to_cpu(
+			ext3_fsblk_t goal = le32_to_cpu(
 					EXT3_SB(sb)->s_es->s_first_data_block) +
-				EXT3_I(inode)->i_block_group *
+				(ext3_fsblk_t)EXT3_I(inode)->i_block_group *
 				EXT3_BLOCKS_PER_GROUP(sb);
-			int block = ext3_new_block(handle, inode, goal, &error);
+			ext3_fsblk_t block = ext3_new_block(handle, inode,
+							goal, &error);
 			if (error)
 				goto cleanup;
 			ea_idebug(inode, "creating block %d", block);
@@ -847,7 +848,7 @@ cleanup_dquot:
 
 bad_block:
 	ext3_error(inode->i_sb, __FUNCTION__,
-		   "inode %ld: bad block %d", inode->i_ino,
+		   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 		   EXT3_I(inode)->i_file_acl);
 	goto cleanup;
 
@@ -1076,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: block %d read error", inode->i_ino,
+			"inode %ld: block "E3FSBLK" read error", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: bad block %d", inode->i_ino,
+			"inode %ld: bad block "E3FSBLK, inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
@@ -1210,11 +1211,11 @@ again:
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
 			ext3_error(inode->i_sb, __FUNCTION__,
-				"inode %ld: block %ld read error",
+				"inode %ld: block %lu read error",
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT3_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %ld refcount %d>=%d",
+			ea_idebug(inode, "block %lu refcount %d>=%d",
 				  (unsigned long) ce->e_block,
 				  le32_to_cpu(BHDR(bh)->h_refcount),
 					  EXT3_XATTR_REFCOUNT_MAX);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c1ce284f8a94..7c35d582ec10 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -539,18 +539,18 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-static int fat_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 
 	/* If the count of free cluster is still unknown, counts it here. */
 	if (sbi->free_clusters == -1) {
-		int err = fat_count_free_clusters(sb);
+		int err = fat_count_free_clusters(dentry->d_sb);
 		if (err)
 			return err;
 	}
 
-	buf->f_type = sb->s_magic;
+	buf->f_type = dentry->d_sb->s_magic;
 	buf->f_bsize = sbi->cluster_size;
 	buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
 	buf->f_bfree = sbi->free_clusters;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 944652e9dde1..308f2b6b5026 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -210,4 +210,3 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 	return err;
 }
 
-EXPORT_SYMBOL_GPL(fat_sync_bhs);
diff --git a/fs/file_table.c b/fs/file_table.c
index bcea1998b4de..506d5307108d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -300,5 +300,5 @@ void __init files_init(unsigned long mempages)
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
-	percpu_counter_init(&nr_files);
+	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index 583bd78086d8..d35979a58743 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -159,11 +159,11 @@ struct vxfs_sb {
  * In core superblock filesystem private data for VxFS.
  */
 struct vxfs_sb_info {
-	struct vxfs_sb		*vsi_raw;	/* raw (on disk) supeblock */
+	struct vxfs_sb		*vsi_raw;	/* raw (on disk) superblock */
 	struct buffer_head	*vsi_bp;	/* buffer for raw superblock*/
 	struct inode		*vsi_fship;	/* fileset header inode */
 	struct inode		*vsi_ilist;	/* inode list inode */
-	struct inode		*vsi_stilist;	/* structual inode list inode */
+	struct inode		*vsi_stilist;	/* structural inode list inode */
 	u_long			vsi_iext;	/* initial inode list */
 	ino_t			vsi_fshino;	/* fileset header inode */
 	daddr_t			vsi_oltext;	/* OLT extent */
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index 6dee109aeea4..78948b4b1894 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -112,7 +112,7 @@ vxfs_read_fshead(struct super_block *sbp)
 
 	vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
 	if (!vip) {
-		printk(KERN_ERR "vxfs: unabled to read fsh inode\n");
+		printk(KERN_ERR "vxfs: unable to read fsh inode\n");
 		return -EINVAL;
 	}
 	if (!VXFS_ISFSH(vip)) {
@@ -129,13 +129,13 @@ vxfs_read_fshead(struct super_block *sbp)
 
 	infp->vsi_fship = vxfs_get_fake_inode(sbp, vip);
 	if (!infp->vsi_fship) {
-		printk(KERN_ERR "vxfs: unabled to get fsh inode\n");
+		printk(KERN_ERR "vxfs: unable to get fsh inode\n");
 		goto out_free_fship;
 	}
 
 	sfp = vxfs_getfsh(infp->vsi_fship, 0);
 	if (!sfp) {
-		printk(KERN_ERR "vxfs: unabled to get structural fsh\n");
+		printk(KERN_ERR "vxfs: unable to get structural fsh\n");
 		goto out_iput_fship;
 	} 
 
@@ -145,7 +145,7 @@ vxfs_read_fshead(struct super_block *sbp)
 
 	pfp = vxfs_getfsh(infp->vsi_fship, 1);
 	if (!pfp) {
-		printk(KERN_ERR "vxfs: unabled to get primary fsh\n");
+		printk(KERN_ERR "vxfs: unable to get primary fsh\n");
 		goto out_free_sfp;
 	}
 
@@ -159,7 +159,7 @@ vxfs_read_fshead(struct super_block *sbp)
 
 	infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip);
 	if (!infp->vsi_stilist) {
-		printk(KERN_ERR "vxfs: unabled to get structual list inode\n");
+		printk(KERN_ERR "vxfs: unable to get structural list inode\n");
 		kfree(tip);
 		goto out_free_pfp;
 	}
@@ -174,7 +174,7 @@ vxfs_read_fshead(struct super_block *sbp)
 		goto out_iput_stilist;
 	infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip);
 	if (!infp->vsi_ilist) {
-		printk(KERN_ERR "vxfs: unabled to get inode list inode\n");
+		printk(KERN_ERR "vxfs: unable to get inode list inode\n");
 		kfree(tip);
 		goto out_iput_stilist;
 	}
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 50aae77651b2..c1be118fc067 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -71,8 +71,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
 {
 	struct page *			pp;
 
-	pp = read_cache_page(mapping, n,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	pp = read_mapping_page(mapping, n, NULL);
 
 	if (!IS_ERR(pp)) {
 		wait_on_page_locked(pp);
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index b44c916d24a1..b74b791fc23b 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
+#include <linux/mount.h>
 
 #include "vxfs.h"
 #include "vxfs_extern.h"
@@ -55,7 +56,7 @@ MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */
 
 
 static void		vxfs_put_super(struct super_block *);
-static int		vxfs_statfs(struct super_block *, struct kstatfs *);
+static int		vxfs_statfs(struct dentry *, struct kstatfs *);
 static int		vxfs_remount(struct super_block *, int *, char *);
 
 static struct super_operations vxfs_super_ops = {
@@ -90,12 +91,12 @@ vxfs_put_super(struct super_block *sbp)
 
 /**
  * vxfs_statfs - get filesystem information
- * @sbp:	VFS superblock
+ * @dentry:	VFS dentry to locate superblock
  * @bufp:	output buffer
  *
  * Description:
  *   vxfs_statfs fills the statfs buffer @bufp with information
- *   about the filesystem described by @sbp.
+ *   about the filesystem described by @dentry.
  *
  * Returns:
  *   Zero.
@@ -107,12 +108,12 @@ vxfs_put_super(struct super_block *sbp)
  *   This is everything but complete...
  */
 static int
-vxfs_statfs(struct super_block *sbp, struct kstatfs *bufp)
+vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
 {
-	struct vxfs_sb_info		*infp = VXFS_SBI(sbp);
+	struct vxfs_sb_info		*infp = VXFS_SBI(dentry->d_sb);
 
 	bufp->f_type = VXFS_SUPER_MAGIC;
-	bufp->f_bsize = sbp->s_blocksize;
+	bufp->f_bsize = dentry->d_sb->s_blocksize;
 	bufp->f_blocks = infp->vsi_raw->vs_dsize;
 	bufp->f_bfree = infp->vsi_raw->vs_free;
 	bufp->f_bavail = 0;
@@ -241,10 +242,11 @@ out:
 /*
  * The usual module blurb.
  */
-static struct super_block *vxfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int vxfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type vxfs_fs_type = {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f3fbe2d030f4..031b27a4bc9a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -461,6 +461,8 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 {
 	struct writeback_control wbc = {
 		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.range_start	= 0,
+		.range_end	= LLONG_MAX,
 	};
 	unsigned long nr_dirty = read_page_state(nr_dirty);
 	unsigned long nr_unstable = read_page_state(nr_unstable);
@@ -559,6 +561,8 @@ int write_inode_now(struct inode *inode, int sync)
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
 		.sync_mode = WB_SYNC_ALL,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
 	};
 
 	if (!mapping_cap_writeback_dirty(inode->i_mapping))
@@ -619,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 	int need_write_inode_now = 0;
 	int err2;
 
-	current->flags |= PF_SYNCWRITE;
 	if (what & OSYNC_DATA)
 		err = filemap_fdatawrite(mapping);
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -632,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 		if (!err)
 			err = err2;
 	}
-	current->flags &= ~PF_SYNCWRITE;
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & I_DIRTY) &&
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index c3e1f760cac9..72437065f6ad 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
 
-fuse-objs := dev.o dir.o file.o inode.o
+fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
new file mode 100644
index 000000000000..a3bce3a77253
--- /dev/null
+++ b/fs/fuse/control.c
@@ -0,0 +1,218 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#define FUSE_CTL_SUPER_MAGIC 0x65735543
+
+/*
+ * This is non-NULL when the single instance of the control filesystem
+ * exists.  Protected by fuse_mutex
+ */
+static struct super_block *fuse_control_sb;
+
+static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
+{
+	struct fuse_conn *fc;
+	mutex_lock(&fuse_mutex);
+	fc = file->f_dentry->d_inode->u.generic_ip;
+	if (fc)
+		fc = fuse_conn_get(fc);
+	mutex_unlock(&fuse_mutex);
+	return fc;
+}
+
+static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+	if (fc) {
+		fuse_abort_conn(fc);
+		fuse_conn_put(fc);
+	}
+	return count;
+}
+
+static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
+				      size_t len, loff_t *ppos)
+{
+	char tmp[32];
+	size_t size;
+
+	if (!*ppos) {
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (!fc)
+			return 0;
+
+		file->private_data=(void *)(long)atomic_read(&fc->num_waiting);
+		fuse_conn_put(fc);
+	}
+	size = sprintf(tmp, "%ld\n", (long)file->private_data);
+	return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static const struct file_operations fuse_ctl_abort_ops = {
+	.open = nonseekable_open,
+	.write = fuse_conn_abort_write,
+};
+
+static const struct file_operations fuse_ctl_waiting_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_waiting_read,
+};
+
+static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
+					  struct fuse_conn *fc,
+					  const char *name,
+					  int mode, int nlink,
+					  struct inode_operations *iop,
+					  const struct file_operations *fop)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
+	dentry = d_alloc_name(parent, name);
+	if (!dentry)
+		return NULL;
+
+	fc->ctl_dentry[fc->ctl_ndents++] = dentry;
+	inode = new_inode(fuse_control_sb);
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = mode;
+	inode->i_uid = fc->user_id;
+	inode->i_gid = fc->group_id;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	/* setting ->i_op to NULL is not allowed */
+	if (iop)
+		inode->i_op = iop;
+	inode->i_fop = fop;
+	inode->i_nlink = nlink;
+	inode->u.generic_ip = fc;
+	d_add(dentry, inode);
+	return dentry;
+}
+
+/*
+ * Add a connection to the control filesystem (if it exists).  Caller
+ * must hold fuse_mutex
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc)
+{
+	struct dentry *parent;
+	char name[32];
+
+	if (!fuse_control_sb)
+		return 0;
+
+	parent = fuse_control_sb->s_root;
+	parent->d_inode->i_nlink++;
+	sprintf(name, "%llu", (unsigned long long) fc->id);
+	parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
+				     &simple_dir_inode_operations,
+				     &simple_dir_operations);
+	if (!parent)
+		goto err;
+
+	if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
+				NULL, &fuse_ctl_waiting_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
+				 NULL, &fuse_ctl_abort_ops))
+		goto err;
+
+	return 0;
+
+ err:
+	fuse_ctl_remove_conn(fc);
+	return -ENOMEM;
+}
+
+/*
+ * Remove a connection from the control filesystem (if it exists).
+ * Caller must hold fuse_mutex
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc)
+{
+	int i;
+
+	if (!fuse_control_sb)
+		return;
+
+	for (i = fc->ctl_ndents - 1; i >= 0; i--) {
+		struct dentry *dentry = fc->ctl_dentry[i];
+		dentry->d_inode->u.generic_ip = NULL;
+		d_drop(dentry);
+		dput(dentry);
+	}
+	fuse_control_sb->s_root->d_inode->i_nlink--;
+}
+
+static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct tree_descr empty_descr = {""};
+	struct fuse_conn *fc;
+	int err;
+
+	err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr);
+	if (err)
+		return err;
+
+	mutex_lock(&fuse_mutex);
+	BUG_ON(fuse_control_sb);
+	fuse_control_sb = sb;
+	list_for_each_entry(fc, &fuse_conn_list, entry) {
+		err = fuse_ctl_add_conn(fc);
+		if (err) {
+			fuse_control_sb = NULL;
+			mutex_unlock(&fuse_mutex);
+			return err;
+		}
+	}
+	mutex_unlock(&fuse_mutex);
+
+	return 0;
+}
+
+static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *raw_data,
+			struct vfsmount *mnt)
+{
+	return get_sb_single(fs_type, flags, raw_data,
+				fuse_ctl_fill_super, mnt);
+}
+
+static void fuse_ctl_kill_sb(struct super_block *sb)
+{
+	mutex_lock(&fuse_mutex);
+	fuse_control_sb = NULL;
+	mutex_unlock(&fuse_mutex);
+
+	kill_litter_super(sb);
+}
+
+static struct file_system_type fuse_ctl_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fusectl",
+	.get_sb		= fuse_ctl_get_sb,
+	.kill_sb	= fuse_ctl_kill_sb,
+};
+
+int __init fuse_ctl_init(void)
+{
+	return register_filesystem(&fuse_ctl_fs_type);
+}
+
+void fuse_ctl_cleanup(void)
+{
+	unregister_filesystem(&fuse_ctl_fs_type);
+}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 104a62dadb94..1e2006caf158 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -34,6 +34,7 @@ static void fuse_request_init(struct fuse_req *req)
 {
 	memset(req, 0, sizeof(*req));
 	INIT_LIST_HEAD(&req->list);
+	INIT_LIST_HEAD(&req->intr_entry);
 	init_waitqueue_head(&req->waitq);
 	atomic_set(&req->count, 1);
 }
@@ -64,18 +65,6 @@ static void restore_sigs(sigset_t *oldset)
 	sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
-/*
- * Reset request, so that it can be reused
- *
- * The caller must be _very_ careful to make sure, that it is holding
- * the only reference to req
- */
-void fuse_reset_request(struct fuse_req *req)
-{
-	BUG_ON(atomic_read(&req->count) != 1);
-	fuse_request_init(req);
-}
-
 static void __fuse_get_request(struct fuse_req *req)
 {
 	atomic_inc(&req->count);
@@ -88,6 +77,13 @@ static void __fuse_put_request(struct fuse_req *req)
 	atomic_dec(&req->count);
 }
 
+static void fuse_req_init_context(struct fuse_req *req)
+{
+	req->in.h.uid = current->fsuid;
+	req->in.h.gid = current->fsgid;
+	req->in.h.pid = current->pid;
+}
+
 struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 {
 	struct fuse_req *req;
@@ -103,14 +99,16 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 	if (intr)
 		goto out;
 
+	err = -ENOTCONN;
+	if (!fc->connected)
+		goto out;
+
 	req = fuse_request_alloc();
 	err = -ENOMEM;
 	if (!req)
 		goto out;
 
-	req->in.h.uid = current->fsuid;
-	req->in.h.gid = current->fsgid;
-	req->in.h.pid = current->pid;
+	fuse_req_init_context(req);
 	req->waiting = 1;
 	return req;
 
@@ -119,142 +117,183 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 	return ERR_PTR(err);
 }
 
-void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+/*
+ * Return request in fuse_file->reserved_req.  However that may
+ * currently be in use.  If that is the case, wait for it to become
+ * available.
+ */
+static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
+					 struct file *file)
 {
-	if (atomic_dec_and_test(&req->count)) {
-		if (req->waiting)
-			atomic_dec(&fc->num_waiting);
-		fuse_request_free(req);
-	}
+	struct fuse_req *req = NULL;
+	struct fuse_file *ff = file->private_data;
+
+	do {
+		wait_event(fc->blocked_waitq, ff->reserved_req);
+		spin_lock(&fc->lock);
+		if (ff->reserved_req) {
+			req = ff->reserved_req;
+			ff->reserved_req = NULL;
+			get_file(file);
+			req->stolen_file = file;
+		}
+		spin_unlock(&fc->lock);
+	} while (!req);
+
+	return req;
 }
 
 /*
- * Called with sbput_sem held for read (request_end) or write
- * (fuse_put_super).  By the time fuse_put_super() is finished, all
- * inodes belonging to background requests must be released, so the
- * iputs have to be done within the locked region.
+ * Put stolen request back into fuse_file->reserved_req
  */
-void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
+static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
 {
-	iput(req->inode);
-	iput(req->inode2);
+	struct file *file = req->stolen_file;
+	struct fuse_file *ff = file->private_data;
+
 	spin_lock(&fc->lock);
-	list_del(&req->bg_entry);
-	if (fc->num_background == FUSE_MAX_BACKGROUND) {
-		fc->blocked = 0;
-		wake_up_all(&fc->blocked_waitq);
-	}
-	fc->num_background--;
+	fuse_request_init(req);
+	BUG_ON(ff->reserved_req);
+	ff->reserved_req = req;
+	wake_up(&fc->blocked_waitq);
 	spin_unlock(&fc->lock);
+	fput(file);
 }
 
 /*
- * This function is called when a request is finished.  Either a reply
- * has arrived or it was interrupted (and not yet sent) or some error
- * occurred during communication with userspace, or the device file
- * was closed.  In case of a background request the reference to the
- * stored objects are released.  The requester thread is woken up (if
- * still waiting), the 'end' callback is called if given, else the
- * reference to the request is released
+ * Gets a request for a file operation, always succeeds
  *
- * Releasing extra reference for foreground requests must be done
- * within the same locked region as setting state to finished.  This
- * is because fuse_reset_request() may be called after request is
- * finished and it must be the sole possessor.  If request is
- * interrupted and put in the background, it will return with an error
- * and hence never be reset and reused.
+ * This is used for sending the FLUSH request, which must get to
+ * userspace, due to POSIX locks which may need to be unlocked.
  *
- * Called with fc->lock, unlocks it
+ * If allocation fails due to OOM, use the reserved request in
+ * fuse_file.
+ *
+ * This is very unlikely to deadlock accidentally, since the
+ * filesystem should not have its own file open.  If deadlock is
+ * intentional, it can still be broken by "aborting" the filesystem.
  */
-static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
 {
-	list_del(&req->list);
-	req->state = FUSE_REQ_FINISHED;
-	if (!req->background) {
-		spin_unlock(&fc->lock);
-		wake_up(&req->waitq);
-		fuse_put_request(fc, req);
-	} else {
-		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-		req->end = NULL;
-		spin_unlock(&fc->lock);
-		down_read(&fc->sbput_sem);
-		if (fc->mounted)
-			fuse_release_background(fc, req);
-		up_read(&fc->sbput_sem);
+	struct fuse_req *req;
 
-		/* fput must go outside sbput_sem, otherwise it can deadlock */
-		if (req->file)
-			fput(req->file);
+	atomic_inc(&fc->num_waiting);
+	wait_event(fc->blocked_waitq, !fc->blocked);
+	req = fuse_request_alloc();
+	if (!req)
+		req = get_reserved_req(fc, file);
 
-		if (end)
-			end(fc, req);
+	fuse_req_init_context(req);
+	req->waiting = 1;
+	return req;
+}
+
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (atomic_dec_and_test(&req->count)) {
+		if (req->waiting)
+			atomic_dec(&fc->num_waiting);
+
+		if (req->stolen_file)
+			put_reserved_req(fc, req);
 		else
-			fuse_put_request(fc, req);
+			fuse_request_free(req);
 	}
 }
 
 /*
- * Unfortunately request interruption not just solves the deadlock
- * problem, it causes problems too.  These stem from the fact, that an
- * interrupted request is continued to be processed in userspace,
- * while all the locks and object references (inode and file) held
- * during the operation are released.
- *
- * To release the locks is exactly why there's a need to interrupt the
- * request, so there's not a lot that can be done about this, except
- * introduce additional locking in userspace.
- *
- * More important is to keep inode and file references until userspace
- * has replied, otherwise FORGET and RELEASE could be sent while the
- * inode/file is still used by the filesystem.
- *
- * For this reason the concept of "background" request is introduced.
- * An interrupted request is backgrounded if it has been already sent
- * to userspace.  Backgrounding involves getting an extra reference to
- * inode(s) or file used in the request, and adding the request to
- * fc->background list.  When a reply is received for a background
- * request, the object references are released, and the request is
- * removed from the list.  If the filesystem is unmounted while there
- * are still background requests, the list is walked and references
- * are released as if a reply was received.
+ * This function is called when a request is finished.  Either a reply
+ * has arrived or it was aborted (and not yet sent) or some error
+ * occurred during communication with userspace, or the device file
+ * was closed.  The requester thread is woken up (if still waiting),
+ * the 'end' callback is called if given, else the reference to the
+ * request is released.
  *
- * There's one more use for a background request.  The RELEASE message is
- * always sent as background, since it doesn't return an error or
- * data.
+ * Called with fc->lock held, unlocks it
  */
-static void background_request(struct fuse_conn *fc, struct fuse_req *req)
-{
-	req->background = 1;
-	list_add(&req->bg_entry, &fc->background);
-	fc->num_background++;
-	if (fc->num_background == FUSE_MAX_BACKGROUND)
-		fc->blocked = 1;
-	if (req->inode)
-		req->inode = igrab(req->inode);
-	if (req->inode2)
-		req->inode2 = igrab(req->inode2);
+static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
+	req->end = NULL;
+	list_del(&req->list);
+	list_del(&req->intr_entry);
+	req->state = FUSE_REQ_FINISHED;
+	if (req->background) {
+		if (fc->num_background == FUSE_MAX_BACKGROUND) {
+			fc->blocked = 0;
+			wake_up_all(&fc->blocked_waitq);
+		}
+		fc->num_background--;
+	}
+	spin_unlock(&fc->lock);
+	dput(req->dentry);
+	mntput(req->vfsmount);
 	if (req->file)
-		get_file(req->file);
+		fput(req->file);
+	wake_up(&req->waitq);
+	if (end)
+		end(fc, req);
+	else
+		fuse_put_request(fc, req);
 }
 
-/* Called with fc->lock held.  Releases, and then reacquires it. */
-static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+static void wait_answer_interruptible(struct fuse_conn *fc,
+				      struct fuse_req *req)
 {
-	sigset_t oldset;
+	if (signal_pending(current))
+		return;
 
 	spin_unlock(&fc->lock);
-	block_sigs(&oldset);
 	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
-	restore_sigs(&oldset);
 	spin_lock(&fc->lock);
-	if (req->state == FUSE_REQ_FINISHED && !req->interrupted)
-		return;
+}
+
+static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+{
+	list_add_tail(&req->intr_entry, &fc->interrupts);
+	wake_up(&fc->waitq);
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+}
+
+/* Called with fc->lock held.  Releases, and then reacquires it. */
+static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (!fc->no_interrupt) {
+		/* Any signal may interrupt this */
+		wait_answer_interruptible(fc, req);
+
+		if (req->aborted)
+			goto aborted;
+		if (req->state == FUSE_REQ_FINISHED)
+			return;
 
-	if (!req->interrupted) {
-		req->out.h.error = -EINTR;
 		req->interrupted = 1;
+		if (req->state == FUSE_REQ_SENT)
+			queue_interrupt(fc, req);
+	}
+
+	if (req->force) {
+		spin_unlock(&fc->lock);
+		wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+		spin_lock(&fc->lock);
+	} else {
+		sigset_t oldset;
+
+		/* Only fatal signals may interrupt this */
+		block_sigs(&oldset);
+		wait_answer_interruptible(fc, req);
+		restore_sigs(&oldset);
 	}
+
+	if (req->aborted)
+		goto aborted;
+	if (req->state == FUSE_REQ_FINISHED)
+ 		return;
+
+	req->out.h.error = -EINTR;
+	req->aborted = 1;
+
+ aborted:
 	if (req->locked) {
 		/* This is uninterruptible sleep, because data is
 		   being copied to/from the buffers of req.  During
@@ -268,8 +307,11 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 	if (req->state == FUSE_REQ_PENDING) {
 		list_del(&req->list);
 		__fuse_put_request(req);
-	} else if (req->state == FUSE_REQ_SENT)
-		background_request(fc, req);
+	} else if (req->state == FUSE_REQ_SENT) {
+		spin_unlock(&fc->lock);
+		wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+		spin_lock(&fc->lock);
+	}
 }
 
 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
@@ -283,13 +325,19 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
 	return nbytes;
 }
 
+static u64 fuse_get_unique(struct fuse_conn *fc)
+ {
+ 	fc->reqctr++;
+ 	/* zero is special */
+ 	if (fc->reqctr == 0)
+ 		fc->reqctr = 1;
+
+	return fc->reqctr;
+}
+
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-	fc->reqctr++;
-	/* zero is special */
-	if (fc->reqctr == 0)
-		fc->reqctr = 1;
-	req->in.h.unique = fc->reqctr;
+	req->in.h.unique = fuse_get_unique(fc);
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
 	list_add_tail(&req->list, &fc->pending);
@@ -302,9 +350,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
 
-/*
- * This can only be interrupted by a SIGKILL
- */
 void request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
@@ -327,8 +372,12 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
 	spin_lock(&fc->lock);
-	background_request(fc, req);
 	if (fc->connected) {
+		req->background = 1;
+		fc->num_background++;
+		if (fc->num_background == FUSE_MAX_BACKGROUND)
+			fc->blocked = 1;
+
 		queue_request(fc, req);
 		spin_unlock(&fc->lock);
 	} else {
@@ -352,14 +401,14 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 /*
  * Lock the request.  Up to the next unlock_request() there mustn't be
  * anything that could cause a page-fault.  If the request was already
- * interrupted bail out.
+ * aborted bail out.
  */
 static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	int err = 0;
 	if (req) {
 		spin_lock(&fc->lock);
-		if (req->interrupted)
+		if (req->aborted)
 			err = -ENOENT;
 		else
 			req->locked = 1;
@@ -369,7 +418,7 @@ static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
 }
 
 /*
- * Unlock request.  If it was interrupted during being locked, the
+ * Unlock request.  If it was aborted during being locked, the
  * requester thread is currently waiting for it to be unlocked, so
  * wake it up.
  */
@@ -378,7 +427,7 @@ static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
 	if (req) {
 		spin_lock(&fc->lock);
 		req->locked = 0;
-		if (req->interrupted)
+		if (req->aborted)
 			wake_up(&req->waitq);
 		spin_unlock(&fc->lock);
 	}
@@ -557,13 +606,18 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
 	return err;
 }
 
+static int request_pending(struct fuse_conn *fc)
+{
+	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts);
+}
+
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
 	add_wait_queue_exclusive(&fc->waitq, &wait);
-	while (fc->connected && list_empty(&fc->pending)) {
+	while (fc->connected && !request_pending(fc)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (signal_pending(current))
 			break;
@@ -577,11 +631,50 @@ static void request_wait(struct fuse_conn *fc)
 }
 
 /*
+ * Transfer an interrupt request to userspace
+ *
+ * Unlike other requests this is assembled on demand, without a need
+ * to allocate a separate fuse_req structure.
+ *
+ * Called with fc->lock held, releases it
+ */
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
+			       const struct iovec *iov, unsigned long nr_segs)
+{
+	struct fuse_copy_state cs;
+	struct fuse_in_header ih;
+	struct fuse_interrupt_in arg;
+	unsigned reqsize = sizeof(ih) + sizeof(arg);
+	int err;
+
+	list_del_init(&req->intr_entry);
+	req->intr_unique = fuse_get_unique(fc);
+	memset(&ih, 0, sizeof(ih));
+	memset(&arg, 0, sizeof(arg));
+	ih.len = reqsize;
+	ih.opcode = FUSE_INTERRUPT;
+	ih.unique = req->intr_unique;
+	arg.unique = req->in.h.unique;
+
+	spin_unlock(&fc->lock);
+	if (iov_length(iov, nr_segs) < reqsize)
+		return -EINVAL;
+
+	fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
+	err = fuse_copy_one(&cs, &ih, sizeof(ih));
+	if (!err)
+		err = fuse_copy_one(&cs, &arg, sizeof(arg));
+	fuse_copy_finish(&cs);
+
+	return err ? err : reqsize;
+}
+
+/*
  * Read a single request into the userspace filesystem's buffer.  This
  * function waits until a request is available, then removes it from
  * the pending list and copies request data to userspace buffer.  If
- * no reply is needed (FORGET) or request has been interrupted or
- * there was an error during the copying then it's finished by calling
+ * no reply is needed (FORGET) or request has been aborted or there
+ * was an error during the copying then it's finished by calling
  * request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
@@ -601,7 +694,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	spin_lock(&fc->lock);
 	err = -EAGAIN;
 	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
-	    list_empty(&fc->pending))
+	    !request_pending(fc))
 		goto err_unlock;
 
 	request_wait(fc);
@@ -609,9 +702,15 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	if (!fc->connected)
 		goto err_unlock;
 	err = -ERESTARTSYS;
-	if (list_empty(&fc->pending))
+	if (!request_pending(fc))
 		goto err_unlock;
 
+	if (!list_empty(&fc->interrupts)) {
+		req = list_entry(fc->interrupts.next, struct fuse_req,
+				 intr_entry);
+		return fuse_read_interrupt(fc, req, iov, nr_segs);
+	}
+
 	req = list_entry(fc->pending.next, struct fuse_req, list);
 	req->state = FUSE_REQ_READING;
 	list_move(&req->list, &fc->io);
@@ -636,10 +735,10 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	fuse_copy_finish(&cs);
 	spin_lock(&fc->lock);
 	req->locked = 0;
-	if (!err && req->interrupted)
+	if (!err && req->aborted)
 		err = -ENOENT;
 	if (err) {
-		if (!req->interrupted)
+		if (!req->aborted)
 			req->out.h.error = -EIO;
 		request_end(fc, req);
 		return err;
@@ -649,6 +748,8 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	else {
 		req->state = FUSE_REQ_SENT;
 		list_move_tail(&req->list, &fc->processing);
+		if (req->interrupted)
+			queue_interrupt(fc, req);
 		spin_unlock(&fc->lock);
 	}
 	return reqsize;
@@ -675,7 +776,7 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 	list_for_each(entry, &fc->processing) {
 		struct fuse_req *req;
 		req = list_entry(entry, struct fuse_req, list);
-		if (req->in.h.unique == unique)
+		if (req->in.h.unique == unique || req->intr_unique == unique)
 			return req;
 	}
 	return NULL;
@@ -741,17 +842,33 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 		goto err_unlock;
 
 	req = request_find(fc, oh.unique);
-	err = -EINVAL;
 	if (!req)
 		goto err_unlock;
 
-	if (req->interrupted) {
+	if (req->aborted) {
 		spin_unlock(&fc->lock);
 		fuse_copy_finish(&cs);
 		spin_lock(&fc->lock);
 		request_end(fc, req);
 		return -ENOENT;
 	}
+	/* Is it an interrupt reply? */
+	if (req->intr_unique == oh.unique) {
+		err = -EINVAL;
+		if (nbytes != sizeof(struct fuse_out_header))
+			goto err_unlock;
+
+		if (oh.error == -ENOSYS)
+			fc->no_interrupt = 1;
+		else if (oh.error == -EAGAIN)
+			queue_interrupt(fc, req);
+
+		spin_unlock(&fc->lock);
+		fuse_copy_finish(&cs);
+		return nbytes;
+	}
+
+	req->state = FUSE_REQ_WRITING;
 	list_move(&req->list, &fc->io);
 	req->out.h = oh;
 	req->locked = 1;
@@ -764,9 +881,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	spin_lock(&fc->lock);
 	req->locked = 0;
 	if (!err) {
-		if (req->interrupted)
+		if (req->aborted)
 			err = -ENOENT;
-	} else if (!req->interrupted)
+	} else if (!req->aborted)
 		req->out.h.error = -EIO;
 	request_end(fc, req);
 
@@ -800,7 +917,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 	spin_lock(&fc->lock);
 	if (!fc->connected)
 		mask = POLLERR;
-	else if (!list_empty(&fc->pending))
+	else if (request_pending(fc))
 		mask |= POLLIN | POLLRDNORM;
 	spin_unlock(&fc->lock);
 
@@ -826,7 +943,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
 /*
  * Abort requests under I/O
  *
- * The requests are set to interrupted and finished, and the request
+ * The requests are set to aborted and finished, and the request
  * waiter is woken up.  This will make request_wait_answer() wait
  * until the request is unlocked and then return.
  *
@@ -841,7 +958,7 @@ static void end_io_requests(struct fuse_conn *fc)
 			list_entry(fc->io.next, struct fuse_req, list);
 		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 
-		req->interrupted = 1;
+		req->aborted = 1;
 		req->out.h.error = -ECONNABORTED;
 		req->state = FUSE_REQ_FINISHED;
 		list_del_init(&req->list);
@@ -874,19 +991,20 @@ static void end_io_requests(struct fuse_conn *fc)
  * onto the pending list is prevented by req->connected being false.
  *
  * Progression of requests under I/O to the processing list is
- * prevented by the req->interrupted flag being true for these
- * requests.  For this reason requests on the io list must be aborted
- * first.
+ * prevented by the req->aborted flag being true for these requests.
+ * For this reason requests on the io list must be aborted first.
  */
 void fuse_abort_conn(struct fuse_conn *fc)
 {
 	spin_lock(&fc->lock);
 	if (fc->connected) {
 		fc->connected = 0;
+		fc->blocked = 0;
 		end_io_requests(fc);
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
 		wake_up_all(&fc->waitq);
+		wake_up_all(&fc->blocked_waitq);
 		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	}
 	spin_unlock(&fc->lock);
@@ -902,7 +1020,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 		end_requests(fc, &fc->processing);
 		spin_unlock(&fc->lock);
 		fasync_helper(-1, file, 0, &fc->fasync);
-		kobject_put(&fc->kobj);
+		fuse_conn_put(fc);
 	}
 
 	return 0;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8d7546e832e8..72a74cde6de8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -79,7 +79,6 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 {
 	req->in.h.opcode = FUSE_LOOKUP;
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
@@ -225,6 +224,20 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 }
 
 /*
+ * Synchronous release for the case when something goes wrong in CREATE_OPEN
+ */
+static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
+			      u64 nodeid, int flags)
+{
+	struct fuse_req *req;
+
+	req = fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
+	req->force = 1;
+	request_send(fc, req);
+	fuse_put_request(fc, req);
+}
+
+/*
  * Atomic create+open operation
  *
  * If the filesystem doesn't support this, then fall back to separate
@@ -237,6 +250,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct inode *inode;
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req;
+	struct fuse_req *forget_req;
 	struct fuse_open_in inarg;
 	struct fuse_open_out outopen;
 	struct fuse_entry_out outentry;
@@ -247,9 +261,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (fc->no_create)
 		return -ENOSYS;
 
+	forget_req = fuse_get_req(fc);
+	if (IS_ERR(forget_req))
+		return PTR_ERR(forget_req);
+
 	req = fuse_get_req(fc);
+	err = PTR_ERR(req);
 	if (IS_ERR(req))
-		return PTR_ERR(req);
+		goto out_put_forget_req;
 
 	err = -ENOMEM;
 	ff = fuse_file_alloc();
@@ -262,7 +281,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	inarg.mode = mode;
 	req->in.h.opcode = FUSE_CREATE;
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -285,25 +303,23 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
 		goto out_free_ff;
 
+	fuse_put_request(fc, req);
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
 			  &outentry.attr);
-	err = -ENOMEM;
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
 		ff->fh = outopen.fh;
-		/* Special release, with inode = NULL, this will
-		   trigger a 'forget' request when the release is
-		   complete */
-		fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
-		goto out_put_request;
+		fuse_sync_release(fc, ff, outentry.nodeid, flags);
+		fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
+		return -ENOMEM;
 	}
-	fuse_put_request(fc, req);
+	fuse_put_request(fc, forget_req);
 	d_instantiate(entry, inode);
 	fuse_change_timeout(entry, &outentry);
 	file = lookup_instantiate_filp(nd, entry, generic_file_open);
 	if (IS_ERR(file)) {
 		ff->fh = outopen.fh;
-		fuse_send_release(fc, ff, outentry.nodeid, inode, flags, 0);
+		fuse_sync_release(fc, ff, outentry.nodeid, flags);
 		return PTR_ERR(file);
 	}
 	fuse_finish_open(inode, file, ff, &outopen);
@@ -313,6 +329,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	fuse_file_free(ff);
  out_put_request:
 	fuse_put_request(fc, req);
+ out_put_forget_req:
+	fuse_put_request(fc, forget_req);
 	return err;
 }
 
@@ -328,7 +346,6 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	int err;
 
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
@@ -448,7 +465,6 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 
 	req->in.h.opcode = FUSE_UNLINK;
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
@@ -480,7 +496,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 
 	req->in.h.opcode = FUSE_RMDIR;
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
@@ -510,8 +525,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	inarg.newdir = get_node_id(newdir);
 	req->in.h.opcode = FUSE_RENAME;
 	req->in.h.nodeid = get_node_id(olddir);
-	req->inode = olddir;
-	req->inode2 = newdir;
 	req->in.numargs = 3;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -558,7 +571,6 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.oldnodeid = get_node_id(inode);
 	req->in.h.opcode = FUSE_LINK;
-	req->inode2 = inode;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -587,7 +599,6 @@ int fuse_do_getattr(struct inode *inode)
 
 	req->in.h.opcode = FUSE_GETATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(arg);
 	req->out.args[0].value = &arg;
@@ -679,7 +690,6 @@ static int fuse_access(struct inode *inode, int mask)
 	inarg.mask = mask;
 	req->in.h.opcode = FUSE_ACCESS;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -820,7 +830,6 @@ static char *read_link(struct dentry *dentry)
 	}
 	req->in.h.opcode = FUSE_READLINK;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = PAGE_SIZE - 1;
@@ -939,7 +948,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 	iattr_to_fattr(attr, &inarg);
 	req->in.h.opcode = FUSE_SETATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -1002,7 +1010,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	inarg.flags = flags;
 	req->in.h.opcode = FUSE_SETXATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 3;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -1041,7 +1048,6 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
 	inarg.size = size;
 	req->in.h.opcode = FUSE_GETXATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -1091,7 +1097,6 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
 	inarg.size = size;
 	req->in.h.opcode = FUSE_LISTXATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -1135,7 +1140,6 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 
 	req->in.h.opcode = FUSE_REMOVEXATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = strlen(name) + 1;
 	req->in.args[0].value = name;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fc342cf7c2cc..28aa81eae2cc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -30,7 +30,6 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 	req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -49,8 +48,8 @@ struct fuse_file *fuse_file_alloc(void)
 	struct fuse_file *ff;
 	ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
 	if (ff) {
-		ff->release_req = fuse_request_alloc();
-		if (!ff->release_req) {
+		ff->reserved_req = fuse_request_alloc();
+		if (!ff->reserved_req) {
 			kfree(ff);
 			ff = NULL;
 		}
@@ -60,7 +59,7 @@ struct fuse_file *fuse_file_alloc(void)
 
 void fuse_file_free(struct fuse_file *ff)
 {
-	fuse_request_free(ff->release_req);
+	fuse_request_free(ff->reserved_req);
 	kfree(ff);
 }
 
@@ -113,37 +112,22 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 	return err;
 }
 
-/* Special case for failed iget in CREATE */
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
+struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
+				   int opcode)
 {
-	/* If called from end_io_requests(), req has more than one
-	   reference and fuse_reset_request() cannot work */
-	if (fc->connected) {
-		u64 nodeid = req->in.h.nodeid;
-		fuse_reset_request(req);
-		fuse_send_forget(fc, req, nodeid, 1);
-	} else
-		fuse_put_request(fc, req);
-}
-
-void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
-		       u64 nodeid, struct inode *inode, int flags, int isdir)
-{
-	struct fuse_req * req = ff->release_req;
+	struct fuse_req *req = ff->reserved_req;
 	struct fuse_release_in *inarg = &req->misc.release_in;
 
 	inarg->fh = ff->fh;
 	inarg->flags = flags;
-	req->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
+	req->in.h.opcode = opcode;
 	req->in.h.nodeid = nodeid;
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_release_in);
 	req->in.args[0].value = inarg;
-	request_send_background(fc, req);
-	if (!inode)
-		req->end = fuse_release_end;
 	kfree(ff);
+
+	return req;
 }
 
 int fuse_release_common(struct inode *inode, struct file *file, int isdir)
@@ -151,8 +135,15 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
 	struct fuse_file *ff = file->private_data;
 	if (ff) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
-		u64 nodeid = get_node_id(inode);
-		fuse_send_release(fc, ff, nodeid, inode, file->f_flags, isdir);
+		struct fuse_req *req;
+
+		req = fuse_release_fill(ff, get_node_id(inode), file->f_flags,
+					isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
+
+		/* Hold vfsmount and dentry until release is finished */
+		req->vfsmount = mntget(file->f_vfsmnt);
+		req->dentry = dget(file->f_dentry);
+		request_send_background(fc, req);
 	}
 
 	/* Return value is ignored by VFS */
@@ -169,7 +160,29 @@ static int fuse_release(struct inode *inode, struct file *file)
 	return fuse_release_common(inode, file, 0);
 }
 
-static int fuse_flush(struct file *file)
+/*
+ * Scramble the ID space with XTEA, so that the value of the files_struct
+ * pointer is not exposed to userspace.
+ */
+static u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
+{
+	u32 *k = fc->scramble_key;
+	u64 v = (unsigned long) id;
+	u32 v0 = v;
+	u32 v1 = v >> 32;
+	u32 sum = 0;
+	int i;
+
+	for (i = 0; i < 32; i++) {
+		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
+		sum += 0x9E3779B9;
+		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
+	}
+
+	return (u64) v0 + ((u64) v1 << 32);
+}
+
+static int fuse_flush(struct file *file, fl_owner_t id)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -184,19 +197,16 @@ static int fuse_flush(struct file *file)
 	if (fc->no_flush)
 		return 0;
 
-	req = fuse_get_req(fc);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
+	req = fuse_get_req_nofail(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
+	inarg.lock_owner = fuse_lock_owner_id(fc, id);
 	req->in.h.opcode = FUSE_FLUSH;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->file = file;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
+	req->force = 1;
 	request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -232,8 +242,6 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	inarg.fsync_flags = datasync ? 1 : 0;
 	req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->file = file;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -266,8 +274,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 	inarg->size = count;
 	req->in.h.opcode = opcode;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->file = file;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_read_in);
 	req->in.args[0].value = inarg;
@@ -342,6 +348,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	req->out.page_zeroing = 1;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	if (fc->async_read) {
+		get_file(file);
+		req->file = file;
 		req->end = fuse_readpages_end;
 		request_send_background(fc, req);
 	} else {
@@ -420,8 +428,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
 	inarg.size = count;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->file = file;
 	req->in.argpages = 1;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -619,6 +625,126 @@ static int fuse_set_page_dirty(struct page *page)
 	return 0;
 }
 
+static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+				  struct file_lock *fl)
+{
+	switch (ffl->type) {
+	case F_UNLCK:
+		break;
+
+	case F_RDLCK:
+	case F_WRLCK:
+		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
+		    ffl->end < ffl->start)
+			return -EIO;
+
+		fl->fl_start = ffl->start;
+		fl->fl_end = ffl->end;
+		fl->fl_pid = ffl->pid;
+		break;
+
+	default:
+		return -EIO;
+	}
+	fl->fl_type = ffl->type;
+	return 0;
+}
+
+static void fuse_lk_fill(struct fuse_req *req, struct file *file,
+			 const struct file_lock *fl, int opcode, pid_t pid)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_lk_in *arg = &req->misc.lk_in;
+
+	arg->fh = ff->fh;
+	arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+	arg->lk.start = fl->fl_start;
+	arg->lk.end = fl->fl_end;
+	arg->lk.type = fl->fl_type;
+	arg->lk.pid = pid;
+	req->in.h.opcode = opcode;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*arg);
+	req->in.args[0].value = arg;
+}
+
+static int fuse_getlk(struct file *file, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_lk_out outarg;
+	int err;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	fuse_lk_fill(req, file, fl, FUSE_GETLK, 0);
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err)
+		err = convert_fuse_file_lock(&outarg.lk, fl);
+
+	return err;
+}
+
+static int fuse_setlk(struct file *file, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+	pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+	int err;
+
+	/* Unlock on close is handled by the flush method */
+	if (fl->fl_flags & FL_CLOSE)
+		return 0;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	fuse_lk_fill(req, file, fl, opcode, pid);
+	request_send(fc, req);
+	err = req->out.h.error;
+	/* locking is restartable */
+	if (err == -EINTR)
+		err = -ERESTARTSYS;
+	fuse_put_request(fc, req);
+	return err;
+}
+
+static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err;
+
+	if (cmd == F_GETLK) {
+		if (fc->no_lock) {
+			if (!posix_test_lock(file, fl, fl))
+				fl->fl_type = F_UNLCK;
+			err = 0;
+		} else
+			err = fuse_getlk(file, fl);
+	} else {
+		if (fc->no_lock)
+			err = posix_lock_file_wait(file, fl);
+		else
+			err = fuse_setlk(file, fl);
+	}
+	return err;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
@@ -628,6 +754,7 @@ static const struct file_operations fuse_file_operations = {
 	.flush		= fuse_flush,
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
+	.lock		= fuse_file_lock,
 	.sendfile	= generic_file_sendfile,
 };
 
@@ -639,6 +766,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.flush		= fuse_flush,
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
+	.lock		= fuse_file_lock,
 	/* no mmap and sendfile */
 };
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0474202cb5dc..0dbf96621841 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -8,12 +8,13 @@
 
 #include <linux/fuse.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/wait.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -24,6 +25,9 @@
 /** It could be as large as PATH_MAX, but would that have any uses? */
 #define FUSE_NAME_MAX 1024
 
+/** Number of dentries for each connection in the control filesystem */
+#define FUSE_CTL_NUM_DENTRIES 3
+
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
     permission checking is done in the kernel */
@@ -33,6 +37,11 @@
     doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
 
+/** List of active connections */
+extern struct list_head fuse_conn_list;
+
+/** Global mutex protecting fuse_conn_list and the control filesystem */
+extern struct mutex fuse_mutex;
 
 /** FUSE inode */
 struct fuse_inode {
@@ -56,7 +65,7 @@ struct fuse_inode {
 /** FUSE specific file data */
 struct fuse_file {
 	/** Request reserved for flush and release */
-	struct fuse_req *release_req;
+	struct fuse_req *reserved_req;
 
 	/** File handle used by userspace */
 	u64 fh;
@@ -122,6 +131,7 @@ enum fuse_req_state {
 	FUSE_REQ_PENDING,
 	FUSE_REQ_READING,
 	FUSE_REQ_SENT,
+	FUSE_REQ_WRITING,
 	FUSE_REQ_FINISHED
 };
 
@@ -135,12 +145,15 @@ struct fuse_req {
 	    fuse_conn */
 	struct list_head list;
 
-	/** Entry on the background list */
-	struct list_head bg_entry;
+	/** Entry on the interrupts list  */
+	struct list_head intr_entry;
 
 	/** refcount */
 	atomic_t count;
 
+	/** Unique ID for the interrupt request */
+	u64 intr_unique;
+
 	/*
 	 * The following bitfields are either set once before the
 	 * request is queued or setting/clearing them is protected by
@@ -150,12 +163,18 @@ struct fuse_req {
 	/** True if the request has reply */
 	unsigned isreply:1;
 
-	/** The request was interrupted */
-	unsigned interrupted:1;
+	/** Force sending of the request even if interrupted */
+	unsigned force:1;
+
+	/** The request was aborted */
+	unsigned aborted:1;
 
 	/** Request is sent in the background */
 	unsigned background:1;
 
+	/** The request has been interrupted */
+	unsigned interrupted:1;
+
 	/** Data is being copied to/from the request */
 	unsigned locked:1;
 
@@ -181,6 +200,7 @@ struct fuse_req {
 		struct fuse_init_in init_in;
 		struct fuse_init_out init_out;
 		struct fuse_read_in read_in;
+		struct fuse_lk_in lk_in;
 	} misc;
 
 	/** page vector */
@@ -192,17 +212,20 @@ struct fuse_req {
 	/** offset of data on first page */
 	unsigned page_offset;
 
-	/** Inode used in the request */
-	struct inode *inode;
-
-	/** Second inode used in the request (or NULL) */
-	struct inode *inode2;
-
 	/** File used in the request (or NULL) */
 	struct file *file;
 
+	/** vfsmount used in release */
+	struct vfsmount *vfsmount;
+
+	/** dentry used in release */
+	struct dentry *dentry;
+
 	/** Request completion callback */
 	void (*end)(struct fuse_conn *, struct fuse_req *);
+
+	/** Request is stolen from fuse_file->reserved_req */
+	struct file *stolen_file;
 };
 
 /**
@@ -216,6 +239,9 @@ struct fuse_conn {
 	/** Lock protecting accessess to  members of this structure */
 	spinlock_t lock;
 
+	/** Refcount */
+	atomic_t count;
+
 	/** The user id for this mount */
 	uid_t user_id;
 
@@ -243,13 +269,12 @@ struct fuse_conn {
 	/** The list of requests under I/O */
 	struct list_head io;
 
-	/** Requests put in the background (RELEASE or any other
-	    interrupted request) */
-	struct list_head background;
-
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
+	/** Pending interrupts */
+	struct list_head interrupts;
+
 	/** Flag indicating if connection is blocked.  This will be
 	    the case before the INIT reply is received, and if there
 	    are too many outstading backgrounds requests */
@@ -258,15 +283,9 @@ struct fuse_conn {
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
-	/** RW semaphore for exclusion with fuse_put_super() */
-	struct rw_semaphore sbput_sem;
-
 	/** The next unique request id */
 	u64 reqctr;
 
-	/** Mount is active */
-	unsigned mounted;
-
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -305,12 +324,18 @@ struct fuse_conn {
 	/** Is removexattr not implemented by fs? */
 	unsigned no_removexattr : 1;
 
+	/** Are file locking primitives not implemented by fs? */
+	unsigned no_lock : 1;
+
 	/** Is access not implemented by fs? */
 	unsigned no_access : 1;
 
 	/** Is create not implemented by fs? */
 	unsigned no_create : 1;
 
+	/** Is interrupt not implemented by fs? */
+	unsigned no_interrupt : 1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -320,11 +345,23 @@ struct fuse_conn {
 	/** Backing dev info */
 	struct backing_dev_info bdi;
 
-	/** kobject */
-	struct kobject kobj;
+	/** Entry on the fuse_conn_list */
+	struct list_head entry;
+
+	/** Unique ID */
+	u64 id;
+
+	/** Dentries in the control filesystem */
+	struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
+
+	/** number of dentries used in the above array */
+	int ctl_ndents;
 
 	/** O_ASYNC requests */
 	struct fasync_struct *fasync;
+
+	/** Key for lock owner ID scrambling */
+	u32 scramble_key[4];
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -337,11 +374,6 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
 	return get_fuse_conn_super(inode->i_sb);
 }
 
-static inline struct fuse_conn *get_fuse_conn_kobj(struct kobject *obj)
-{
-	return container_of(obj, struct fuse_conn, kobj);
-}
-
 static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
 {
 	return container_of(inode, struct fuse_inode, inode);
@@ -383,12 +415,9 @@ void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file,
 		      struct fuse_file *ff, struct fuse_open_out *outarg);
 
-/**
- * Send a RELEASE request
- */
-void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
-		       u64 nodeid, struct inode *inode, int flags, int isdir);
-
+/** Set up and return the request used to release @ff (TODO confirm) */
+struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
+				   int opcode);
 /**
  * Send RELEASE or RELEASEDIR request
  */
@@ -435,6 +464,9 @@ int fuse_dev_init(void);
  */
 void fuse_dev_cleanup(void);
 
+int fuse_ctl_init(void);
+void fuse_ctl_cleanup(void);
+
 /**
  * Allocate a request
  */
@@ -446,14 +478,14 @@ struct fuse_req *fuse_request_alloc(void);
 void fuse_request_free(struct fuse_req *req);
 
 /**
- * Reinitialize a request, the preallocated flag is left unmodified
+ * Get a request, may fail with -ENOMEM
  */
-void fuse_reset_request(struct fuse_req *req);
+struct fuse_req *fuse_get_req(struct fuse_conn *fc);
 
 /**
- * Reserve a preallocated request
+ * Get a request for a file operation, always succeeds
  */
-struct fuse_req *fuse_get_req(struct fuse_conn *fc);
+struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
 
 /**
  * Decrement reference count of a request.  If count goes to zero free
@@ -476,11 +508,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
  */
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
-/**
- * Release inodes and file associated with background request
- */
-void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
-
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
 
@@ -493,3 +520,23 @@ int fuse_do_getattr(struct inode *inode);
  * Invalidate inode attributes
  */
 void fuse_invalidate_attr(struct inode *inode);
+
+/**
+ * Acquire reference to fuse_conn
+ */
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+
+/**
+ * Release reference to fuse_conn
+ */
+void fuse_conn_put(struct fuse_conn *fc);
+
+/**
+ * Add connection to control filesystem
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc);
+
+/**
+ * Remove connection from control filesystem
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7627022446b2..dcaaabd3b9c4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -11,25 +11,20 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/file.h>
-#include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/parser.h>
 #include <linux/statfs.h>
+#include <linux/random.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
 MODULE_LICENSE("GPL");
 
 static kmem_cache_t *fuse_inode_cachep;
-static struct subsystem connections_subsys;
-
-struct fuse_conn_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct fuse_conn *, char *);
-	ssize_t (*store)(struct fuse_conn *, const char *, size_t);
-};
+struct list_head fuse_conn_list;
+DEFINE_MUTEX(fuse_mutex);
 
 #define FUSE_SUPER_MAGIC 0x65735546
 
@@ -104,6 +99,14 @@ static void fuse_clear_inode(struct inode *inode)
 	}
 }
 
+static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	if (*flags & MS_MANDLOCK)	/* also refused in fuse_fill_super() */
+		return -EINVAL;
+
+	return 0;	/* sb and data are not examined */
+}
+
 void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
 {
 	if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
@@ -195,31 +198,29 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 	return inode;
 }
 
-static void fuse_umount_begin(struct super_block *sb)
+static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	fuse_abort_conn(get_fuse_conn_super(sb));
+	if (flags & MNT_FORCE)
+		fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb));
 }
 
 static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
-	down_write(&fc->sbput_sem);
-	while (!list_empty(&fc->background))
-		fuse_release_background(fc,
-					list_entry(fc->background.next,
-						   struct fuse_req, bg_entry));
-
 	spin_lock(&fc->lock);
-	fc->mounted = 0;
 	fc->connected = 0;
+	fc->blocked = 0;
 	spin_unlock(&fc->lock);
-	up_write(&fc->sbput_sem);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
-	kobject_del(&fc->kobj);
-	kobject_put(&fc->kobj);
+	wake_up_all(&fc->blocked_waitq);
+	mutex_lock(&fuse_mutex);
+	list_del(&fc->entry);
+	fuse_ctl_remove_conn(fc);
+	mutex_unlock(&fuse_mutex);
+	fuse_conn_put(fc);
 }
 
 static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
@@ -236,8 +237,9 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
 	/* fsid is left zero */
 }
 
-static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 	struct fuse_req *req;
 	struct fuse_statfs_out outarg;
@@ -368,11 +370,6 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
-static void fuse_conn_release(struct kobject *kobj)
-{
-	kfree(get_fuse_conn_kobj(kobj));
-}
-
 static struct fuse_conn *new_conn(void)
 {
 	struct fuse_conn *fc;
@@ -380,24 +377,35 @@ static struct fuse_conn *new_conn(void)
 	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (fc) {
 		spin_lock_init(&fc->lock);
+		atomic_set(&fc->count, 1);
 		init_waitqueue_head(&fc->waitq);
 		init_waitqueue_head(&fc->blocked_waitq);
 		INIT_LIST_HEAD(&fc->pending);
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
-		INIT_LIST_HEAD(&fc->background);
-		init_rwsem(&fc->sbput_sem);
-		kobj_set_kset_s(fc, connections_subsys);
-		kobject_init(&fc->kobj);
+		INIT_LIST_HEAD(&fc->interrupts);
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
 		fc->reqctr = 0;
 		fc->blocked = 1;
+		get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 	}
 	return fc;
 }
 
+void fuse_conn_put(struct fuse_conn *fc)
+{
+	if (atomic_dec_and_test(&fc->count))	/* count dropped to zero */
+		kfree(fc);
+}
+
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
+{
+	atomic_inc(&fc->count);	/* paired with fuse_conn_put() */
+	return fc;	/* lets callers take the reference inline */
+}
+
 static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 {
 	struct fuse_attr attr;
@@ -413,6 +421,7 @@ static struct super_operations fuse_super_operations = {
 	.destroy_inode  = fuse_destroy_inode,
 	.read_inode	= fuse_read_inode,
 	.clear_inode	= fuse_clear_inode,
+	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
 	.umount_begin	= fuse_umount_begin,
 	.statfs		= fuse_statfs,
@@ -432,8 +441,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
 				fc->async_read = 1;
-		} else
+			if (!(arg->flags & FUSE_POSIX_LOCKS))
+				fc->no_lock = 1;
+		} else {
 			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+			fc->no_lock = 1;
+		}
 
 		fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
 		fc->minor = arg->minor;
@@ -451,7 +464,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
-	arg->flags |= FUSE_ASYNC_READ;
+	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -467,10 +480,9 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	request_send_background(fc, req);
 }
 
-static unsigned long long conn_id(void)
+static u64 conn_id(void)
 {
-	/* BKL is held for ->get_sb() */
-	static unsigned long long ctr = 1;
+	static u64 ctr = 1;
 	return ctr++;
 }
 
@@ -484,6 +496,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	struct fuse_req *init_req;
 	int err;
 
+	if (sb->s_flags & MS_MANDLOCK)
+		return -EINVAL;
+
 	if (!parse_fuse_opt((char *) data, &d))
 		return -EINVAL;
 
@@ -527,25 +542,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!init_req)
 		goto err_put_root;
 
-	err = kobject_set_name(&fc->kobj, "%llu", conn_id());
-	if (err)
-		goto err_free_req;
-
-	err = kobject_add(&fc->kobj);
-	if (err)
-		goto err_free_req;
-
-	/* Setting file->private_data can't race with other mount()
-	   instances, since BKL is held for ->get_sb() */
+	mutex_lock(&fuse_mutex);
 	err = -EINVAL;
 	if (file->private_data)
-		goto err_kobject_del;
+		goto err_unlock;
 
+	fc->id = conn_id();
+	err = fuse_ctl_add_conn(fc);
+	if (err)
+		goto err_unlock;
+
+	list_add_tail(&fc->entry, &fuse_conn_list);
 	sb->s_root = root_dentry;
-	fc->mounted = 1;
 	fc->connected = 1;
-	kobject_get(&fc->kobj);
-	file->private_data = fc;
+	file->private_data = fuse_conn_get(fc);
+	mutex_unlock(&fuse_mutex);
 	/*
 	 * atomic_dec_and_test() in fput() provides the necessary
 	 * memory barrier for file->private_data to be visible on all
@@ -557,23 +568,22 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	return 0;
 
- err_kobject_del:
-	kobject_del(&fc->kobj);
- err_free_req:
+ err_unlock:
+	mutex_unlock(&fuse_mutex);
 	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
  err:
 	fput(file);
-	kobject_put(&fc->kobj);
+	fuse_conn_put(fc);
 	return err;
 }
 
-static struct super_block *fuse_get_sb(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *raw_data)
+static int fuse_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *raw_data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super);
+	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
 
 static struct file_system_type fuse_fs_type = {
@@ -583,68 +593,8 @@ static struct file_system_type fuse_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static ssize_t fuse_conn_waiting_show(struct fuse_conn *fc, char *page)
-{
-	return sprintf(page, "%i\n", atomic_read(&fc->num_waiting));
-}
-
-static ssize_t fuse_conn_abort_store(struct fuse_conn *fc, const char *page,
-				     size_t count)
-{
-	fuse_abort_conn(fc);
-	return count;
-}
-
-static struct fuse_conn_attr fuse_conn_waiting =
-	__ATTR(waiting, 0400, fuse_conn_waiting_show, NULL);
-static struct fuse_conn_attr fuse_conn_abort =
-	__ATTR(abort, 0600, NULL, fuse_conn_abort_store);
-
-static struct attribute *fuse_conn_attrs[] = {
-	&fuse_conn_waiting.attr,
-	&fuse_conn_abort.attr,
-	NULL,
-};
-
-static ssize_t fuse_conn_attr_show(struct kobject *kobj,
-				   struct attribute *attr,
-				   char *page)
-{
-	struct fuse_conn_attr *fca =
-		container_of(attr, struct fuse_conn_attr, attr);
-
-	if (fca->show)
-		return fca->show(get_fuse_conn_kobj(kobj), page);
-	else
-		return -EACCES;
-}
-
-static ssize_t fuse_conn_attr_store(struct kobject *kobj,
-				    struct attribute *attr,
-				    const char *page, size_t count)
-{
-	struct fuse_conn_attr *fca =
-		container_of(attr, struct fuse_conn_attr, attr);
-
-	if (fca->store)
-		return fca->store(get_fuse_conn_kobj(kobj), page, count);
-	else
-		return -EACCES;
-}
-
-static struct sysfs_ops fuse_conn_sysfs_ops = {
-	.show	= &fuse_conn_attr_show,
-	.store	= &fuse_conn_attr_store,
-};
-
-static struct kobj_type ktype_fuse_conn = {
-	.release	= fuse_conn_release,
-	.sysfs_ops	= &fuse_conn_sysfs_ops,
-	.default_attrs	= fuse_conn_attrs,
-};
-
 static decl_subsys(fuse, NULL, NULL);
-static decl_subsys(connections, &ktype_fuse_conn, NULL);
+static decl_subsys(connections, NULL, NULL);
 
 static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
 				 unsigned long flags)
@@ -718,6 +668,7 @@ static int __init fuse_init(void)
 	printk("fuse init (API version %i.%i)\n",
 	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
 
+	INIT_LIST_HEAD(&fuse_conn_list);
 	res = fuse_fs_init();
 	if (res)
 		goto err;
@@ -730,8 +681,14 @@ static int __init fuse_init(void)
 	if (res)
 		goto err_dev_cleanup;
 
+	res = fuse_ctl_init();
+	if (res)
+		goto err_sysfs_cleanup;
+
 	return 0;
 
+ err_sysfs_cleanup:
+	fuse_sysfs_cleanup();
  err_dev_cleanup:
 	fuse_dev_cleanup();
  err_fs_cleanup:
@@ -744,6 +701,7 @@ static void __exit fuse_exit(void)
 {
 	printk(KERN_DEBUG "fuse exit\n");
 
+	fuse_ctl_cleanup();
 	fuse_sysfs_cleanup();
 	fuse_fs_cleanup();
 	fuse_dev_cleanup();
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 1e44dcfe49c4..13231dd5ce66 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -280,7 +280,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	block = off >> PAGE_CACHE_SHIFT;
 	node->page_offset = off & ~PAGE_CACHE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; i++) {
-		page = read_cache_page(mapping, block++, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, block++, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index d20131ce4b95..400357994319 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -59,7 +59,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	unlock_new_inode(tree->inode);
 
 	mapping = tree->inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto free_tree;
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1181d116117d..d9227bf14e86 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -80,8 +80,10 @@ static void hfs_put_super(struct super_block *sb)
  *
  * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks.
  */
-static int hfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div;
@@ -413,10 +415,11 @@ bail:
 	return res;
 }
 
-static struct super_block *hfs_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name, void *data)
+static int hfs_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data,
+		      struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
 }
 
 static struct file_system_type hfs_fs_type = {
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 9fb51632303c..d128a25b74d2 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -31,8 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
-	page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-			       (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	i = offset % 32;
@@ -72,8 +71,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		offset += PAGE_CACHE_BITS;
 		if (offset >= size)
 			break;
-		page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-				       (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
 		curr = pptr = kmap(page);
 		if ((size ^ offset) / PAGE_CACHE_BITS)
 			end = pptr + PAGE_CACHE_BITS / 32;
@@ -119,8 +118,8 @@ found:
 		set_page_dirty(page);
 		kunmap(page);
 		offset += PAGE_CACHE_BITS;
-		page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-				       (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
@@ -167,7 +166,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
-	page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	end = pptr + PAGE_CACHE_BITS / 32;
@@ -199,7 +198,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 			break;
 		set_page_dirty(page);
 		kunmap(page);
-		page = read_cache_page(mapping, ++pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, ++pnr, NULL);
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 746abc9ecf70..77bf434da679 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -440,7 +440,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	block = off >> PAGE_CACHE_SHIFT;
 	node->page_offset = off & ~PAGE_CACHE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; block++, i++) {
-		page = read_cache_page(mapping, block, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, block, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index effa8991999c..cfc852fdd1b5 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -38,7 +38,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_tree;
 
 	mapping = tree->inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto free_tree;
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7843f792a4b7..0a92fa2336a2 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -212,8 +212,10 @@ static void hfsplus_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
-static int hfsplus_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
@@ -450,10 +452,12 @@ static void hfsplus_destroy_inode(struct inode *inode)
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
 
-static struct super_block *hfsplus_get_sb(struct file_system_type *fs_type,
-					  int flags, const char *dev_name, void *data)
+static int hfsplus_get_sb(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data,
+			  struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
+			   mnt);
 }
 
 static struct file_system_type hfsplus_fs_type = {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bf0f8e16e433..8e0d37743e7c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -239,7 +239,7 @@ static int read_inode(struct inode *ino)
 	return(err);
 }
 
-int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
+int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
 	/* do_statfs uses struct statfs64 internally, but the linux kernel
 	 * struct statfs still has 32-bit versions for most of these fields,
@@ -252,7 +252,7 @@ int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
 	long long f_files;
 	long long f_ffree;
 
-	err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename,
+	err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename,
 			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
 			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
 			&sf->f_namelen, sf->f_spare);
@@ -993,11 +993,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static struct super_block *hostfs_read_sb(struct file_system_type *type,
-					     int flags, const char *dev_name,
-					     void *data)
+static int hostfs_read_sb(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data, struct vfsmount *mnt)
 {
-	return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common));
+	return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
 }
 
 static struct file_system_type hostfs_type = {
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index d72d8c87c996..f798480a363f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -135,8 +135,9 @@ static unsigned count_bitmaps(struct super_block *s)
 	return count;
 }
 
-static int hpfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
 	lock_kernel();
 
@@ -662,10 +663,11 @@ bail0:
 	return -EINVAL;
 }
 
-static struct super_block *hpfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hpfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type hpfs_fs_type = {
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 5e6363be246f..3a9bdf58166f 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -616,7 +616,7 @@ static const struct file_operations hppfs_dir_fops = {
 	.fsync		= hppfs_fsync,
 };
 
-static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf)
+static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
 	sf->f_blocks = 0;
 	sf->f_bfree = 0;
@@ -769,11 +769,11 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static struct super_block *hppfs_read_super(struct file_system_type *type,
-					     int flags, const char *dev_name,
-					     void *data)
+static int hppfs_read_super(struct file_system_type *type,
+			    int flags, const char *dev_name,
+			    void *data, struct vfsmount *mnt)
 {
-	return(get_sb_nodev(type, flags, data, hppfs_fill_super));
+	return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
 }
 
 static struct file_system_type hppfs_type = {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a5b4e923455..e6410d8edd0e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -59,7 +59,6 @@ static void huge_pagevec_release(struct pagevec *pvec)
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 
@@ -87,9 +86,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	if (vma->vm_flags & VM_MAYSHARE)
-		if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
-			goto out;
+	if (vma->vm_flags & VM_MAYSHARE &&
+	    hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+				  len >> HPAGE_SHIFT))
+		goto out;
 
 	ret = 0;
 	hugetlb_prefault_arch_hook(vma->vm_mm);
@@ -195,12 +195,8 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 	const pgoff_t start = lstart >> HPAGE_SHIFT;
 	struct pagevec pvec;
 	pgoff_t next;
-	int i;
+	int i, freed = 0;
 
-	hugetlb_truncate_reservation(HUGETLBFS_I(inode),
-				     lstart >> HPAGE_SHIFT);
-	if (!mapping->nrpages)
-		return;
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (1) {
@@ -221,10 +217,12 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 			truncate_huge_page(page);
 			unlock_page(page);
 			hugetlb_put_quota(mapping);
+			freed++;
 		}
 		huge_pagevec_release(&pvec);
 	}
 	BUG_ON(!lstart && mapping->nrpages);
+	hugetlb_unreserve_pages(inode, start, freed);
 }
 
 static void hugetlbfs_delete_inode(struct inode *inode)
@@ -366,6 +364,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
 		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
 		switch (mode & S_IFMT) {
@@ -467,9 +466,9 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }
 
-static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
 
 	buf->f_type = HUGETLBFS_MAGIC;
 	buf->f_bsize = HPAGE_SIZE;
@@ -538,7 +537,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
-	p->prereserved_hpages = 0;
 	return &p->vfs_inode;
 }
 
@@ -723,10 +721,10 @@ void hugetlb_put_quota(struct address_space *mapping)
 	}
 }
 
-static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hugetlbfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
 }
 
 static struct file_system_type hugetlbfs_fs_type = {
@@ -781,8 +779,7 @@ struct file *hugetlb_zero_setup(size_t size)
 		goto out_file;
 
 	error = -ENOMEM;
-	if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
-				       size >> HPAGE_SHIFT) != 0)
+	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/fs/inotify.c b/fs/inotify.c
index 732ec4bd5774..723836a1f718 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -5,7 +5,10 @@
  *	John McCutchan	<ttb@tentacle.dhs.org>
  *	Robert Love	<rml@novell.com>
  *
+ * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
+ *
  * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -20,35 +23,17 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
-#include <linux/syscalls.h>
-
-#include <asm/ioctls.h>
 
 static atomic_t inotify_cookie;
 
-static kmem_cache_t *watch_cachep __read_mostly;
-static kmem_cache_t *event_cachep __read_mostly;
-
-static struct vfsmount *inotify_mnt __read_mostly;
-
-/* these are configurable via /proc/sys/fs/inotify/ */
-int inotify_max_user_instances __read_mostly;
-int inotify_max_user_watches __read_mostly;
-int inotify_max_queued_events __read_mostly;
-
 /*
  * Lock ordering:
  *
@@ -56,327 +41,108 @@ int inotify_max_queued_events __read_mostly;
  * iprune_mutex (synchronize shrink_icache_memory())
  * 	inode_lock (protects the super_block->s_inodes list)
  * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- * 		inotify_dev->mutex (protects inotify_device and watches->d_list)
+ * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
+ *
+ * The inode->inotify_mutex and inotify_handle->mutex are held during execution
+ * of a caller's event handler.  Thus, the caller must not hold any locks
+ * taken in their event handler while calling any of the published inotify
+ * interfaces.
  */
 
 /*
- * Lifetimes of the three main data structures--inotify_device, inode, and
+ * Lifetimes of the three main data structures--inotify_handle, inode, and
  * inotify_watch--are managed by reference count.
  *
- * inotify_device: Lifetime is from inotify_init() until release.  Additional
- * references can bump the count via get_inotify_dev() and drop the count via
- * put_inotify_dev().
+ * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
+ * Additional references can bump the count via get_inotify_handle() and drop
+ * the count via put_inotify_handle().
  *
- * inotify_watch: Lifetime is from create_watch() to destory_watch().
- * Additional references can bump the count via get_inotify_watch() and drop
- * the count via put_inotify_watch().
+ * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
+ * to remove_watch_no_event().  Additional references can bump the count via
+ * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
+ * is responsible for the final put after receiving IN_IGNORED, or when using
+ * IN_ONESHOT after receiving the first event.  Inotify does the final put if
+ * inotify_destroy() is called.
  *
  * inode: Pinned so long as the inode is associated with a watch, from
- * create_watch() to put_inotify_watch().
+ * inotify_add_watch() to the final put_inotify_watch().
  */
 
 /*
- * struct inotify_device - represents an inotify instance
+ * struct inotify_handle - represents an inotify instance
  *
  * This structure is protected by the mutex 'mutex'.
  */
-struct inotify_device {
-	wait_queue_head_t 	wq;		/* wait queue for i/o */
+struct inotify_handle {
 	struct idr		idr;		/* idr mapping wd -> watch */
 	struct mutex		mutex;		/* protects this bad boy */
-	struct list_head 	events;		/* list of queued events */
 	struct list_head	watches;	/* list of watches */
 	atomic_t		count;		/* reference count */
-	struct user_struct	*user;		/* user who opened this dev */
-	unsigned int		queue_size;	/* size of the queue (bytes) */
-	unsigned int		event_count;	/* number of pending events */
-	unsigned int		max_events;	/* maximum number of events */
 	u32			last_wd;	/* the last wd allocated */
+	const struct inotify_operations *in_ops; /* inotify caller operations */
 };
 
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-	struct inotify_event	event;	/* the user-space event */
-	struct list_head        list;	/* entry in inotify_device's list */
-	char			*name;	/* filename, if any */
-};
-
-/*
- * struct inotify_watch - represents a watch request on a specific inode
- *
- * d_list is protected by dev->mutex of the associated watch->dev.
- * i_list and mask are protected by inode->inotify_mutex of the associated inode.
- * dev, inode, and wd are never written to once the watch is created.
- */
-struct inotify_watch {
-	struct list_head	d_list;	/* entry in inotify_device's list */
-	struct list_head	i_list;	/* entry in inode's list */
-	atomic_t		count;	/* reference count */
-	struct inotify_device	*dev;	/* associated device */
-	struct inode		*inode;	/* associated inode */
-	s32 			wd;	/* watch descriptor */
-	u32			mask;	/* event mask for this watch */
-};
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/sysctl.h>
-
-static int zero;
-
-ctl_table inotify_table[] = {
-	{
-		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
-		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
-		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero, 
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
-		.procname	= "max_queued_events",
-		.data		= &inotify_max_queued_events,
-		.maxlen		= sizeof(int),
-		.mode		= 0644, 
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec, 
-		.extra1		= &zero
-	},
-	{ .ctl_name = 0 }
-};
-#endif /* CONFIG_SYSCTL */
-
-static inline void get_inotify_dev(struct inotify_device *dev)
+static inline void get_inotify_handle(struct inotify_handle *ih)
 {
-	atomic_inc(&dev->count);
+	atomic_inc(&ih->count);
 }
 
-static inline void put_inotify_dev(struct inotify_device *dev)
+static inline void put_inotify_handle(struct inotify_handle *ih)
 {
-	if (atomic_dec_and_test(&dev->count)) {
-		atomic_dec(&dev->user->inotify_devs);
-		free_uid(dev->user);
-		idr_destroy(&dev->idr);
-		kfree(dev);
+	if (atomic_dec_and_test(&ih->count)) {
+		idr_destroy(&ih->idr);
+		kfree(ih);
 	}
 }
 
-static inline void get_inotify_watch(struct inotify_watch *watch)
+/**
+ * get_inotify_watch - grab a reference to an inotify_watch
+ * @watch: watch to grab
+ */
+void get_inotify_watch(struct inotify_watch *watch)
 {
 	atomic_inc(&watch->count);
 }
+EXPORT_SYMBOL_GPL(get_inotify_watch);
 
-/*
+/**
  * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * the watch and its references if the count reaches zero.
+ * watch references if the count reaches zero.  inotify_watch is freed by
+ * inotify callers via the destroy_watch() op.
+ * @watch: watch to release
  */
-static inline void put_inotify_watch(struct inotify_watch *watch)
+void put_inotify_watch(struct inotify_watch *watch)
 {
 	if (atomic_dec_and_test(&watch->count)) {
-		put_inotify_dev(watch->dev);
-		iput(watch->inode);
-		kmem_cache_free(watch_cachep, watch);
-	}
-}
-
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-						  const char *name)
-{
-	struct inotify_kernel_event *kevent;
-
-	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
-	if (unlikely(!kevent))
-		return NULL;
-
-	/* we hand this out to user-space, so zero it just in case */
-	memset(&kevent->event, 0, sizeof(struct inotify_event));
-
-	kevent->event.wd = wd;
-	kevent->event.mask = mask;
-	kevent->event.cookie = cookie;
-
-	INIT_LIST_HEAD(&kevent->list);
-
-	if (name) {
-		size_t len, rem, event_size = sizeof(struct inotify_event);
-
-		/*
-		 * We need to pad the filename so as to properly align an
-		 * array of inotify_event structures.  Because the structure is
-		 * small and the common case is a small filename, we just round
-		 * up to the next multiple of the structure's sizeof.  This is
-		 * simple and safe for all architectures.
-		 */
-		len = strlen(name) + 1;
-		rem = event_size - len;
-		if (len > event_size) {
-			rem = event_size - (len % event_size);
-			if (len % event_size == 0)
-				rem = 0;
-		}
-
-		kevent->name = kmalloc(len + rem, GFP_KERNEL);
-		if (unlikely(!kevent->name)) {
-			kmem_cache_free(event_cachep, kevent);
-			return NULL;
-		}
-		memcpy(kevent->name, name, len);
-		if (rem)
-			memset(kevent->name + len, 0, rem);		
-		kevent->event.len = len + rem;
-	} else {
-		kevent->event.len = 0;
-		kevent->name = NULL;
-	}
-
-	return kevent;
-}
-
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-	return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_queue_event - add a new event to the given device
- *
- * Caller must hold dev->mutex.  Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_device *dev,
-				    struct inotify_watch *watch, u32 mask,
-				    u32 cookie, const char *name)
-{
-	struct inotify_kernel_event *kevent, *last;
-
-	/* coalescing: drop this event if it is a dupe of the previous */
-	last = inotify_dev_get_event(dev);
-	if (last && last->event.mask == mask && last->event.wd == watch->wd &&
-			last->event.cookie == cookie) {
-		const char *lastname = last->name;
-
-		if (!name && !lastname)
-			return;
-		if (name && lastname && !strcmp(lastname, name))
-			return;
-	}
-
-	/* the queue overflowed and we already sent the Q_OVERFLOW event */
-	if (unlikely(dev->event_count > dev->max_events))
-		return;
-
-	/* if the queue overflows, we need to notify user space */
-	if (unlikely(dev->event_count == dev->max_events))
-		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-	else
-		kevent = kernel_event(watch->wd, mask, cookie, name);
-
-	if (unlikely(!kevent))
-		return;
-
-	/* queue the event and wake up anyone waiting */
-	dev->event_count++;
-	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-	list_add_tail(&kevent->list, &dev->events);
-	wake_up_interruptible(&dev->wq);
-}
-
-/*
- * remove_kevent - cleans up and ultimately frees the given kevent
- *
- * Caller must hold dev->mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-			  struct inotify_kernel_event *kevent)
-{
-	list_del(&kevent->list);
-
-	dev->event_count--;
-	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-
-	kfree(kevent->name);
-	kmem_cache_free(event_cachep, kevent);
-}
+		struct inotify_handle *ih = watch->ih;
 
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-	if (!list_empty(&dev->events)) {
-		struct inotify_kernel_event *kevent;
-		kevent = inotify_dev_get_event(dev);
-		remove_kevent(dev, kevent);
+		iput(watch->inode);
+		ih->in_ops->destroy_watch(watch);
+		put_inotify_handle(ih);
 	}
 }
+EXPORT_SYMBOL_GPL(put_inotify_watch);
 
 /*
- * inotify_dev_get_wd - returns the next WD for use by the given dev
+ * inotify_handle_get_wd - returns the next WD for use by the given handle
  *
- * Callers must hold dev->mutex.  This function can sleep.
+ * Callers must hold ih->mutex.  This function can sleep.
  */
-static int inotify_dev_get_wd(struct inotify_device *dev,
-			      struct inotify_watch *watch)
+static int inotify_handle_get_wd(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
 {
 	int ret;
 
 	do {
-		if (unlikely(!idr_pre_get(&dev->idr, GFP_KERNEL)))
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
 			return -ENOSPC;
-		ret = idr_get_new_above(&dev->idr, watch, dev->last_wd+1, &watch->wd);
+		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
 	} while (ret == -EAGAIN);
 
-	return ret;
-}
+	if (likely(!ret))
+		ih->last_wd = watch->wd;
 
-/*
- * find_inode - resolve a user-given path to a specific inode and return a nd
- */
-static int find_inode(const char __user *dirname, struct nameidata *nd,
-		      unsigned flags)
-{
-	int error;
-
-	error = __user_walk(dirname, flags, nd);
-	if (error)
-		return error;
-	/* you can only watch an inode if you have read permissions on it */
-	error = vfs_permission(nd, MAY_READ);
-	if (error) 
-		path_release(nd);
-	return error;
+	return ret;
 }
 
 /*
@@ -422,67 +188,18 @@ static void set_dentry_child_flags(struct inode *inode, int watched)
 }
 
 /*
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->mutex.  Calls inotify_dev_get_wd() so may sleep.
- * Both 'dev' and 'inode' (by way of nameidata) need to be pinned.
- */
-static struct inotify_watch *create_watch(struct inotify_device *dev,
-					  u32 mask, struct inode *inode)
-{
-	struct inotify_watch *watch;
-	int ret;
-
-	if (atomic_read(&dev->user->inotify_watches) >=
-			inotify_max_user_watches)
-		return ERR_PTR(-ENOSPC);
-
-	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-	if (unlikely(!watch))
-		return ERR_PTR(-ENOMEM);
-
-	ret = inotify_dev_get_wd(dev, watch);
-	if (unlikely(ret)) {
-		kmem_cache_free(watch_cachep, watch);
-		return ERR_PTR(ret);
-	}
-
-	dev->last_wd = watch->wd;
-	watch->mask = mask;
-	atomic_set(&watch->count, 0);
-	INIT_LIST_HEAD(&watch->d_list);
-	INIT_LIST_HEAD(&watch->i_list);
-
-	/* save a reference to device and bump the count to make it official */
-	get_inotify_dev(dev);
-	watch->dev = dev;
-
-	/*
-	 * Save a reference to the inode and bump the ref count to make it
-	 * official.  We hold a reference to nameidata, which makes this safe.
-	 */
-	watch->inode = igrab(inode);
-
-	/* bump our own count, corresponding to our entry in dev->watches */
-	get_inotify_watch(watch);
-
-	atomic_inc(&dev->user->inotify_watches);
-
-	return watch;
-}
-
-/*
- * inotify_find_dev - find the watch associated with the given inode and dev
+ * inode_find_handle - find the watch associated with the given inode and
+ * handle
  *
  * Callers must hold inode->inotify_mutex.
  */
-static struct inotify_watch *inode_find_dev(struct inode *inode,
-					    struct inotify_device *dev)
+static struct inotify_watch *inode_find_handle(struct inode *inode,
+					       struct inotify_handle *ih)
 {
 	struct inotify_watch *watch;
 
 	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-		if (watch->dev == dev)
+		if (watch->ih == ih)
 			return watch;
 	}
 
@@ -490,40 +207,40 @@ static struct inotify_watch *inode_find_dev(struct inode *inode,
 }
 
 /*
- * remove_watch_no_event - remove_watch() without the IN_IGNORED event.
+ * remove_watch_no_event - remove watch without the IN_IGNORED event.
+ *
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
 static void remove_watch_no_event(struct inotify_watch *watch,
-				  struct inotify_device *dev)
+				  struct inotify_handle *ih)
 {
 	list_del(&watch->i_list);
-	list_del(&watch->d_list);
+	list_del(&watch->h_list);
 
 	if (!inotify_inode_watched(watch->inode))
 		set_dentry_child_flags(watch->inode, 0);
 
-	atomic_dec(&dev->user->inotify_watches);
-	idr_remove(&dev->idr, watch->wd);
-	put_inotify_watch(watch);
+	idr_remove(&ih->idr, watch->wd);
 }
 
-/*
- * remove_watch - Remove a watch from both the device and the inode.  Sends
- * the IN_IGNORED event to the given device signifying that the inode is no
- * longer watched.
- *
- * Callers must hold both inode->inotify_mutex and dev->mutex.  We drop a
- * reference to the inode before returning.
+/**
+ * inotify_remove_watch_locked - Remove a watch from both the handle and the
+ * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
+ * watched.  May be invoked from a caller's event handler.
+ * @ih: inotify handle associated with watch
+ * @watch: watch to remove
  *
- * The inode is not iput() so as to remain atomic.  If the inode needs to be
- * iput(), the call returns one.  Otherwise, it returns zero.
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
-static void remove_watch(struct inotify_watch *watch,struct inotify_device *dev)
+void inotify_remove_watch_locked(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
 {
-	inotify_dev_queue_event(dev, watch, IN_IGNORED, 0, NULL);
-	remove_watch_no_event(watch, dev);
+	remove_watch_no_event(watch, ih);
+	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
 }
+EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
 
-/* Kernel API */
+/* Kernel API for producing events */
 
 /*
  * inotify_d_instantiate - instantiate dcache entry for inode
@@ -563,9 +280,10 @@ void inotify_d_move(struct dentry *entry)
  * @mask: event mask describing this event
  * @cookie: cookie for synchronization, or zero
  * @name: filename, if any
+ * @n_inode: inode associated with name
  */
 void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
-			       const char *name)
+			       const char *name, struct inode *n_inode)
 {
 	struct inotify_watch *watch, *next;
 
@@ -576,14 +294,13 @@ void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
 		u32 watch_mask = watch->mask;
 		if (watch_mask & mask) {
-			struct inotify_device *dev = watch->dev;
-			get_inotify_watch(watch);
-			mutex_lock(&dev->mutex);
-			inotify_dev_queue_event(dev, watch, mask, cookie, name);
+			struct inotify_handle *ih= watch->ih;
+			mutex_lock(&ih->mutex);
 			if (watch_mask & IN_ONESHOT)
-				remove_watch_no_event(watch, dev);
-			mutex_unlock(&dev->mutex);
-			put_inotify_watch(watch);
+				remove_watch_no_event(watch, ih);
+			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
+						 name, n_inode);
+			mutex_unlock(&ih->mutex);
 		}
 	}
 	mutex_unlock(&inode->inotify_mutex);
@@ -613,7 +330,8 @@ void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
 	if (inotify_inode_watched(inode)) {
 		dget(parent);
 		spin_unlock(&dentry->d_lock);
-		inotify_inode_queue_event(inode, mask, cookie, name);
+		inotify_inode_queue_event(inode, mask, cookie, name,
+					  dentry->d_inode);
 		dput(parent);
 	} else
 		spin_unlock(&dentry->d_lock);
@@ -665,7 +383,7 @@ void inotify_unmount_inodes(struct list_head *list)
 
 		need_iput_tmp = need_iput;
 		need_iput = NULL;
-		/* In case the remove_watch() drops a reference. */
+		/* In case inotify_remove_watch_locked() drops a reference. */
 		if (inode != need_iput_tmp)
 			__iget(inode);
 		else
@@ -694,11 +412,12 @@ void inotify_unmount_inodes(struct list_head *list)
 		mutex_lock(&inode->inotify_mutex);
 		watches = &inode->inotify_watches;
 		list_for_each_entry_safe(watch, next_w, watches, i_list) {
-			struct inotify_device *dev = watch->dev;
-			mutex_lock(&dev->mutex);
-			inotify_dev_queue_event(dev, watch, IN_UNMOUNT,0,NULL);
-			remove_watch(watch, dev);
-			mutex_unlock(&dev->mutex);
+			struct inotify_handle *ih= watch->ih;
+			mutex_lock(&ih->mutex);
+			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
+						 NULL, NULL);
+			inotify_remove_watch_locked(ih, watch);
+			mutex_unlock(&ih->mutex);
 		}
 		mutex_unlock(&inode->inotify_mutex);
 		iput(inode);		
@@ -718,432 +437,292 @@ void inotify_inode_is_dead(struct inode *inode)
 
 	mutex_lock(&inode->inotify_mutex);
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		struct inotify_device *dev = watch->dev;
-		mutex_lock(&dev->mutex);
-		remove_watch(watch, dev);
-		mutex_unlock(&dev->mutex);
+		struct inotify_handle *ih = watch->ih;
+		mutex_lock(&ih->mutex);
+		inotify_remove_watch_locked(ih, watch);
+		mutex_unlock(&ih->mutex);
 	}
 	mutex_unlock(&inode->inotify_mutex);
 }
 EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
 
-/* Device Interface */
+/* Kernel Consumer API */
 
-static unsigned int inotify_poll(struct file *file, poll_table *wait)
+/**
+ * inotify_init - allocate and initialize an inotify instance
+ * @ops: caller's inotify operations
+ */
+struct inotify_handle *inotify_init(const struct inotify_operations *ops)
 {
-	struct inotify_device *dev = file->private_data;
-	int ret = 0;
+	struct inotify_handle *ih;
 
-	poll_wait(file, &dev->wq, wait);
-	mutex_lock(&dev->mutex);
-	if (!list_empty(&dev->events))
-		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&dev->mutex);
+	ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
+	if (unlikely(!ih))
+		return ERR_PTR(-ENOMEM);
 
-	return ret;
+	idr_init(&ih->idr);
+	INIT_LIST_HEAD(&ih->watches);
+	mutex_init(&ih->mutex);
+	ih->last_wd = 0;
+	ih->in_ops = ops;
+	atomic_set(&ih->count, 0);
+	get_inotify_handle(ih);
+
+	return ih;
 }
+EXPORT_SYMBOL_GPL(inotify_init);
 
-static ssize_t inotify_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *pos)
+/**
+ * inotify_init_watch - initialize an inotify watch
+ * @watch: watch to initialize
+ */
+void inotify_init_watch(struct inotify_watch *watch)
 {
-	size_t event_size = sizeof (struct inotify_event);
-	struct inotify_device *dev;
-	char __user *start;
-	int ret;
-	DEFINE_WAIT(wait);
-
-	start = buf;
-	dev = file->private_data;
-
-	while (1) {
-		int events;
-
-		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
-
-		mutex_lock(&dev->mutex);
-		events = !list_empty(&dev->events);
-		mutex_unlock(&dev->mutex);
-		if (events) {
-			ret = 0;
-			break;
-		}
-
-		if (file->f_flags & O_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-
-		schedule();
-	}
-
-	finish_wait(&dev->wq, &wait);
-	if (ret)
-		return ret;
-
-	mutex_lock(&dev->mutex);
-	while (1) {
-		struct inotify_kernel_event *kevent;
-
-		ret = buf - start;
-		if (list_empty(&dev->events))
-			break;
-
-		kevent = inotify_dev_get_event(dev);
-		if (event_size + kevent->event.len > count)
-			break;
-
-		if (copy_to_user(buf, &kevent->event, event_size)) {
-			ret = -EFAULT;
-			break;
-		}
-		buf += event_size;
-		count -= event_size;
-
-		if (kevent->name) {
-			if (copy_to_user(buf, kevent->name, kevent->event.len)){
-				ret = -EFAULT;
-				break;
-			}
-			buf += kevent->event.len;
-			count -= kevent->event.len;
-		}
-
-		remove_kevent(dev, kevent);
-	}
-	mutex_unlock(&dev->mutex);
-
-	return ret;
+	INIT_LIST_HEAD(&watch->h_list);
+	INIT_LIST_HEAD(&watch->i_list);
+	atomic_set(&watch->count, 0);
+	get_inotify_watch(watch); /* initial get */
 }
+EXPORT_SYMBOL_GPL(inotify_init_watch);
 
-static int inotify_release(struct inode *ignored, struct file *file)
+/**
+ * inotify_destroy - clean up and destroy an inotify instance
+ * @ih: inotify handle
+ */
+void inotify_destroy(struct inotify_handle *ih)
 {
-	struct inotify_device *dev = file->private_data;
-
 	/*
-	 * Destroy all of the watches on this device.  Unfortunately, not very
+	 * Destroy all of the watches for this handle. Unfortunately, not very
 	 * pretty.  We cannot do a simple iteration over the list, because we
 	 * do not know the inode until we iterate to the watch.  But we need to
-	 * hold inode->inotify_mutex before dev->mutex.  The following works.
+	 * hold inode->inotify_mutex before ih->mutex.  The following works.
 	 */
 	while (1) {
 		struct inotify_watch *watch;
 		struct list_head *watches;
 		struct inode *inode;
 
-		mutex_lock(&dev->mutex);
-		watches = &dev->watches;
+		mutex_lock(&ih->mutex);
+		watches = &ih->watches;
 		if (list_empty(watches)) {
-			mutex_unlock(&dev->mutex);
+			mutex_unlock(&ih->mutex);
 			break;
 		}
-		watch = list_entry(watches->next, struct inotify_watch, d_list);
+		watch = list_entry(watches->next, struct inotify_watch, h_list);
 		get_inotify_watch(watch);
-		mutex_unlock(&dev->mutex);
+		mutex_unlock(&ih->mutex);
 
 		inode = watch->inode;
 		mutex_lock(&inode->inotify_mutex);
-		mutex_lock(&dev->mutex);
+		mutex_lock(&ih->mutex);
 
 		/* make sure we didn't race with another list removal */
-		if (likely(idr_find(&dev->idr, watch->wd)))
-			remove_watch_no_event(watch, dev);
+		if (likely(idr_find(&ih->idr, watch->wd))) {
+			remove_watch_no_event(watch, ih);
+			put_inotify_watch(watch);
+		}
 
-		mutex_unlock(&dev->mutex);
+		mutex_unlock(&ih->mutex);
 		mutex_unlock(&inode->inotify_mutex);
 		put_inotify_watch(watch);
 	}
 
-	/* destroy all of the events on this device */
-	mutex_lock(&dev->mutex);
-	while (!list_empty(&dev->events))
-		inotify_dev_event_dequeue(dev);
-	mutex_unlock(&dev->mutex);
-
-	/* free this device: the put matching the get in inotify_init() */
-	put_inotify_dev(dev);
-
-	return 0;
+	/* free this handle: the put matching the get in inotify_init() */
+	put_inotify_handle(ih);
 }
+EXPORT_SYMBOL_GPL(inotify_destroy);
 
-/*
- * inotify_ignore - remove a given wd from this inotify instance.
+/**
+ * inotify_find_watch - find an existing watch for an (ih,inode) pair
+ * @ih: inotify handle
+ * @inode: inode to watch
+ * @watchp: pointer to existing inotify_watch
  *
- * Can sleep.
+ * Caller must pin given inode (via nameidata).
  */
-static int inotify_ignore(struct inotify_device *dev, s32 wd)
+s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+		       struct inotify_watch **watchp)
 {
-	struct inotify_watch *watch;
-	struct inode *inode;
-
-	mutex_lock(&dev->mutex);
-	watch = idr_find(&dev->idr, wd);
-	if (unlikely(!watch)) {
-		mutex_unlock(&dev->mutex);
-		return -EINVAL;
-	}
-	get_inotify_watch(watch);
-	inode = watch->inode;
-	mutex_unlock(&dev->mutex);
+	struct inotify_watch *old;
+	int ret = -ENOENT;
 
 	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&dev->mutex);
+	mutex_lock(&ih->mutex);
 
-	/* make sure that we did not race */
-	if (likely(idr_find(&dev->idr, wd) == watch))
-		remove_watch(watch, dev);
+	old = inode_find_handle(inode, ih);
+	if (unlikely(old)) {
+		get_inotify_watch(old); /* caller must put watch */
+		*watchp = old;
+		ret = old->wd;
+	}
 
-	mutex_unlock(&dev->mutex);
+	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(watch);
 
-	return 0;
+	return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_find_watch);
 
-static long inotify_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
+/**
+ * inotify_find_update_watch - find and update the mask of an existing watch
+ * @ih: inotify handle
+ * @inode: inode's watch to update
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ */
+s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
+			      u32 mask)
 {
-	struct inotify_device *dev;
-	void __user *p;
-	int ret = -ENOTTY;
-
-	dev = file->private_data;
-	p = (void __user *) arg;
-
-	switch (cmd) {
-	case FIONREAD:
-		ret = put_user(dev->queue_size, (int __user *) p);
-		break;
-	}
-
-	return ret;
-}
+	struct inotify_watch *old;
+	int mask_add = 0;
+	int ret;
 
-static const struct file_operations inotify_fops = {
-	.poll           = inotify_poll,
-	.read           = inotify_read,
-	.release        = inotify_release,
-	.unlocked_ioctl = inotify_ioctl,
-	.compat_ioctl	= inotify_ioctl,
-};
+	if (mask & IN_MASK_ADD)
+		mask_add = 1;
 
-asmlinkage long sys_inotify_init(void)
-{
-	struct inotify_device *dev;
-	struct user_struct *user;
-	struct file *filp;	
-	int fd, ret;
-
-	fd = get_unused_fd();
-	if (fd < 0)
-		return fd;
-
-	filp = get_empty_filp();
-	if (!filp) {
-		ret = -ENFILE;
-		goto out_put_fd;
-	}
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
+		return -EINVAL;
 
-	user = get_uid(current->user);
-	if (unlikely(atomic_read(&user->inotify_devs) >=
-			inotify_max_user_instances)) {
-		ret = -EMFILE;
-		goto out_free_uid;
-	}
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
 
-	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
-	if (unlikely(!dev)) {
-		ret = -ENOMEM;
-		goto out_free_uid;
+	/*
+	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
+	 * are already watching.  We just update the mask and return its wd.
+	 */
+	old = inode_find_handle(inode, ih);
+	if (unlikely(!old)) {
+		ret = -ENOENT;
+		goto out;
 	}
 
-	filp->f_op = &inotify_fops;
-	filp->f_vfsmnt = mntget(inotify_mnt);
-	filp->f_dentry = dget(inotify_mnt->mnt_root);
-	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
-	filp->f_mode = FMODE_READ;
-	filp->f_flags = O_RDONLY;
-	filp->private_data = dev;
-
-	idr_init(&dev->idr);
-	INIT_LIST_HEAD(&dev->events);
-	INIT_LIST_HEAD(&dev->watches);
-	init_waitqueue_head(&dev->wq);
-	mutex_init(&dev->mutex);
-	dev->event_count = 0;
-	dev->queue_size = 0;
-	dev->max_events = inotify_max_queued_events;
-	dev->user = user;
-	dev->last_wd = 0;
-	atomic_set(&dev->count, 0);
-
-	get_inotify_dev(dev);
-	atomic_inc(&user->inotify_devs);
-	fd_install(fd, filp);
-
-	return fd;
-out_free_uid:
-	free_uid(user);
-	put_filp(filp);
-out_put_fd:
-	put_unused_fd(fd);
+	if (mask_add)
+		old->mask |= mask;
+	else
+		old->mask = mask;
+	ret = old->wd;
+out:
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_find_update_watch);
 
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+/**
+ * inotify_add_watch - add a watch to an inotify instance
+ * @ih: inotify handle
+ * @watch: caller allocated watch structure
+ * @inode: inode to watch
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ * Caller must ensure it only calls inotify_add_watch() once per watch.
+ * Calls inotify_handle_get_wd() so may sleep.
+ */
+s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
+		      struct inode *inode, u32 mask)
 {
-	struct inotify_watch *watch, *old;
-	struct inode *inode;
-	struct inotify_device *dev;
-	struct nameidata nd;
-	struct file *filp;
-	int ret, fput_needed;
-	int mask_add = 0;
-	unsigned flags = 0;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
-
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto fput_and_out;
-	}
-
-	if (!(mask & IN_DONT_FOLLOW))
-		flags |= LOOKUP_FOLLOW;
-	if (mask & IN_ONLYDIR)
-		flags |= LOOKUP_DIRECTORY;
-
-	ret = find_inode(path, &nd, flags);
-	if (unlikely(ret))
-		goto fput_and_out;
+	int ret = 0;
 
-	/* inode held in place by reference to nd; dev by fget on fd */
-	inode = nd.dentry->d_inode;
-	dev = filp->private_data;
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
+		return -EINVAL;
+	watch->mask = mask;
 
 	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&dev->mutex);
-
-	if (mask & IN_MASK_ADD)
-		mask_add = 1;
+	mutex_lock(&ih->mutex);
 
-	/* don't let user-space set invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask)) {
-		ret = -EINVAL;
+	/* Initialize a new watch */
+	ret = inotify_handle_get_wd(ih, watch);
+	if (unlikely(ret))
 		goto out;
-	}
+	ret = watch->wd;
+
+	/* save a reference to handle and bump the count to make it official */
+	get_inotify_handle(ih);
+	watch->ih = ih;
 
 	/*
-	 * Handle the case of re-adding a watch on an (inode,dev) pair that we
-	 * are already watching.  We just update the mask and return its wd.
+	 * Save a reference to the inode and bump the ref count to make it
+	 * official.  We hold a reference to nameidata, which makes this safe.
 	 */
-	old = inode_find_dev(inode, dev);
-	if (unlikely(old)) {
-		if (mask_add)
-			old->mask |= mask;
-		else
-			old->mask = mask;
-		ret = old->wd;
-		goto out;
-	}
-
-	watch = create_watch(dev, mask, inode);
-	if (unlikely(IS_ERR(watch))) {
-		ret = PTR_ERR(watch);
-		goto out;
-	}
+	watch->inode = igrab(inode);
 
 	if (!inotify_inode_watched(inode))
 		set_dentry_child_flags(inode, 1);
 
-	/* Add the watch to the device's and the inode's list */
-	list_add(&watch->d_list, &dev->watches);
+	/* Add the watch to the handle's and the inode's list */
+	list_add(&watch->h_list, &ih->watches);
 	list_add(&watch->i_list, &inode->inotify_watches);
-	ret = watch->wd;
 out:
-	mutex_unlock(&dev->mutex);
+	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	path_release(&nd);
-fput_and_out:
-	fput_light(filp, fput_needed);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_add_watch);
 
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+/**
+ * inotify_rm_wd - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @wd: watch descriptor to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
-	struct file *filp;
-	struct inotify_device *dev;
-	int ret, fput_needed;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
+	struct inotify_watch *watch;
+	struct inode *inode;
 
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto out;
+	mutex_lock(&ih->mutex);
+	watch = idr_find(&ih->idr, wd);
+	if (unlikely(!watch)) {
+		mutex_unlock(&ih->mutex);
+		return -EINVAL;
 	}
+	get_inotify_watch(watch);
+	inode = watch->inode;
+	mutex_unlock(&ih->mutex);
 
-	dev = filp->private_data;
-	ret = inotify_ignore(dev, wd);
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
 
-out:
-	fput_light(filp, fput_needed);
-	return ret;
+	/* make sure that we did not race */
+	if (likely(idr_find(&ih->idr, wd) == watch))
+		inotify_remove_watch_locked(ih, watch);
+
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	put_inotify_watch(watch);
+
+	return 0;
 }
+EXPORT_SYMBOL_GPL(inotify_rm_wd);
 
-static struct super_block *
-inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data)
+/**
+ * inotify_rm_watch - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @watch: watch to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_watch(struct inotify_handle *ih,
+		     struct inotify_watch *watch)
 {
-    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+	return inotify_rm_wd(ih, watch->wd);
 }
-
-static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
-    .get_sb         = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
-};
+EXPORT_SYMBOL_GPL(inotify_rm_watch);
 
 /*
- * inotify_setup - Our initialization function.  Note that we cannnot return
- * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
- * must result in panic().
+ * inotify_setup - core initialization function
  */
 static int __init inotify_setup(void)
 {
-	int ret;
-
-	ret = register_filesystem(&inotify_fs_type);
-	if (unlikely(ret))
-		panic("inotify: register_filesystem returned %d!\n", ret);
-
-	inotify_mnt = kern_mount(&inotify_fs_type);
-	if (IS_ERR(inotify_mnt))
-		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
-
-	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
-
 	atomic_set(&inotify_cookie, 0);
 
-	watch_cachep = kmem_cache_create("inotify_watch_cache",
-					 sizeof(struct inotify_watch),
-					 0, SLAB_PANIC, NULL, NULL);
-	event_cachep = kmem_cache_create("inotify_event_cache",
-					 sizeof(struct inotify_kernel_event),
-					 0, SLAB_PANIC, NULL, NULL);
-
 	return 0;
 }
 
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
new file mode 100644
index 000000000000..f2386442adee
--- /dev/null
+++ b/fs/inotify_user.c
@@ -0,0 +1,719 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/inotify.h>
+#include <linux/syscalls.h>
+
+#include <asm/ioctls.h>
+
+static kmem_cache_t *watch_cachep __read_mostly;
+static kmem_cache_t *event_cachep __read_mostly;
+
+static struct vfsmount *inotify_mnt __read_mostly;
+
+/* these are configurable via /proc/sys/fs/inotify/ */
+int inotify_max_user_instances __read_mostly;
+int inotify_max_user_watches __read_mostly;
+int inotify_max_queued_events __read_mostly;
+
+/*
+ * Lock ordering:
+ *
+ * inotify_dev->up_mutex (ensures we don't re-add the same watch)
+ * 	inode->inotify_mutex (protects inode's watch list)
+ * 		inotify_handle->mutex (protects inotify_handle's watch list)
+ * 			inotify_dev->ev_mutex (protects device's event queue)
+ */
+
+/*
+ * Lifetimes of the main data structures:
+ *
+ * inotify_device: Lifetime is managed by reference count, from
+ * sys_inotify_init() until release.  Additional references can bump the count
+ * via get_inotify_dev() and drop the count via put_inotify_dev().
+ *
+ * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
+ * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
+ * first event, or to inotify_destroy().
+ */
+
+/*
+ * struct inotify_device - represents an inotify instance
+ *
+ * The event queue is protected by 'ev_mutex'; 'up_mutex' serializes updates.
+ */
+struct inotify_device {
+	wait_queue_head_t 	wq;		/* wait queue for i/o */
+	struct mutex		ev_mutex;	/* protects event queue */
+	struct mutex		up_mutex;	/* synchronizes watch updates */
+	struct list_head 	events;		/* list of queued events */
+	atomic_t		count;		/* reference count */
+	struct user_struct	*user;		/* user who opened this dev */
+	struct inotify_handle	*ih;		/* inotify handle */
+	unsigned int		queue_size;	/* size of the queue (bytes) */
+	unsigned int		event_count;	/* number of pending events */
+	unsigned int		max_events;	/* maximum number of events */
+};
+
+/*
+ * struct inotify_kernel_event - An inotify event, originating from a watch and
+ * queued for user-space.  A list of these is attached to each instance of the
+ * device.  In read(), this list is walked and all events that can fit in the
+ * buffer are returned.
+ *
+ * Protected by dev->ev_mutex of the device in which we are queued.
+ */
+struct inotify_kernel_event {
+	struct inotify_event	event;	/* the user-space event */
+	struct list_head        list;	/* entry in inotify_device's list */
+	char			*name;	/* filename, if any */
+};
+
+/*
+ * struct inotify_user_watch - our version of an inotify_watch, we add
+ * a reference to the associated inotify_device.
+ */
+struct inotify_user_watch {
+	struct inotify_device	*dev;	/* associated device */
+	struct inotify_watch	wdata;	/* inotify watch data */
+};
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table inotify_table[] = {
+	{
+		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
+		.procname	= "max_user_instances",
+		.data		= &inotify_max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
+		.procname	= "max_user_watches",
+		.data		= &inotify_max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
+		.procname	= "max_queued_events",
+		.data		= &inotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
+static inline void get_inotify_dev(struct inotify_device *dev)
+{
+	atomic_inc(&dev->count);
+}
+
+static inline void put_inotify_dev(struct inotify_device *dev)
+{
+	if (atomic_dec_and_test(&dev->count)) {
+		atomic_dec(&dev->user->inotify_devs);
+		free_uid(dev->user);
+		kfree(dev);
+	}
+}
+
+/*
+ * free_inotify_user_watch - cleans up the watch and its references
+ */
+static void free_inotify_user_watch(struct inotify_watch *w)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	atomic_dec(&dev->user->inotify_watches);
+	put_inotify_dev(dev);
+	kmem_cache_free(watch_cachep, watch);
+}
+
+/*
+ * kernel_event - create a new kernel event with the given parameters
+ *
+ * This function can sleep.
+ */
+static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
+						  const char *name)
+{
+	struct inotify_kernel_event *kevent;
+
+	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
+	if (unlikely(!kevent))
+		return NULL;
+
+	/* we hand this out to user-space, so zero it just in case */
+	memset(&kevent->event, 0, sizeof(struct inotify_event));
+
+	kevent->event.wd = wd;
+	kevent->event.mask = mask;
+	kevent->event.cookie = cookie;
+
+	INIT_LIST_HEAD(&kevent->list);
+
+	if (name) {
+		size_t len, rem, event_size = sizeof(struct inotify_event);
+
+		/*
+		 * We need to pad the filename so as to properly align an
+		 * array of inotify_event structures.  Because the structure is
+		 * small and the common case is a small filename, we just round
+		 * up to the next multiple of the structure's sizeof.  This is
+		 * simple and safe for all architectures.
+		 */
+		len = strlen(name) + 1;
+		rem = event_size - len;
+		if (len > event_size) {
+			rem = event_size - (len % event_size);
+			if (len % event_size == 0)
+				rem = 0;
+		}
+
+		kevent->name = kmalloc(len + rem, GFP_KERNEL);
+		if (unlikely(!kevent->name)) {
+			kmem_cache_free(event_cachep, kevent);
+			return NULL;
+		}
+		memcpy(kevent->name, name, len);
+		if (rem)
+			memset(kevent->name + len, 0, rem);
+		kevent->event.len = len + rem;
+	} else {
+		kevent->event.len = 0;
+		kevent->name = NULL;
+	}
+
+	return kevent;
+}
+
+/*
+ * inotify_dev_get_event - return the next event in the given dev's queue
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_event(struct inotify_device *dev)
+{
+	return list_entry(dev->events.next, struct inotify_kernel_event, list);
+}
+
+/*
+ * inotify_dev_queue_event - event handler registered with core inotify, adds
+ * a new event to the given device
+ *
+ * Can sleep (calls kernel_event()).
+ */
+static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
+				    u32 cookie, const char *name,
+				    struct inode *ignored)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+	struct inotify_kernel_event *kevent, *last;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	mutex_lock(&dev->ev_mutex);
+
+	/* we can safely put the watch as we don't reference it while
+	 * generating the event
+	 */
+	if (mask & IN_IGNORED || mask & IN_ONESHOT)
+		put_inotify_watch(w); /* final put */
+
+	/* coalescing: drop this event if it is a dupe of the previous */
+	last = inotify_dev_get_event(dev);
+	if (last && last->event.mask == mask && last->event.wd == wd &&
+			last->event.cookie == cookie) {
+		const char *lastname = last->name;
+
+		if (!name && !lastname)
+			goto out;
+		if (name && lastname && !strcmp(lastname, name))
+			goto out;
+	}
+
+	/* the queue overflowed and we already sent the Q_OVERFLOW event */
+	if (unlikely(dev->event_count > dev->max_events))
+		goto out;
+
+	/* if the queue overflows, we need to notify user space */
+	if (unlikely(dev->event_count == dev->max_events))
+		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
+	else
+		kevent = kernel_event(wd, mask, cookie, name);
+
+	if (unlikely(!kevent))
+		goto out;
+
+	/* queue the event and wake up anyone waiting */
+	dev->event_count++;
+	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
+	list_add_tail(&kevent->list, &dev->events);
+	wake_up_interruptible(&dev->wq);
+
+out:
+	mutex_unlock(&dev->ev_mutex);
+}
+
+/*
+ * remove_kevent - cleans up and ultimately frees the given kevent
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void remove_kevent(struct inotify_device *dev,
+			  struct inotify_kernel_event *kevent)
+{
+	list_del(&kevent->list);
+
+	dev->event_count--;
+	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+
+	kfree(kevent->name);
+	kmem_cache_free(event_cachep, kevent);
+}
+
+/*
+ * inotify_dev_event_dequeue - destroy an event on the given device
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void inotify_dev_event_dequeue(struct inotify_device *dev)
+{
+	if (!list_empty(&dev->events)) {
+		struct inotify_kernel_event *kevent;
+		kevent = inotify_dev_get_event(dev);
+		remove_kevent(dev, kevent);
+	}
+}
+
+/*
+ * find_inode - resolve a user-given path to a specific inode and fill in nd
+ */
+static int find_inode(const char __user *dirname, struct nameidata *nd,
+		      unsigned flags)
+{
+	int error;
+
+	error = __user_walk(dirname, flags, nd);
+	if (error)
+		return error;
+	/* you can only watch an inode if you have read permissions on it */
+	error = vfs_permission(nd, MAY_READ);
+	if (error)
+		path_release(nd);
+	return error;
+}
+
+/*
+ * create_watch - creates a watch on the given device.
+ *
+ * Callers must hold dev->up_mutex.
+ */
+static int create_watch(struct inotify_device *dev, struct inode *inode,
+			u32 mask)
+{
+	struct inotify_user_watch *watch;
+	int ret;
+
+	if (atomic_read(&dev->user->inotify_watches) >=
+			inotify_max_user_watches)
+		return -ENOSPC;
+
+	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
+	if (unlikely(!watch))
+		return -ENOMEM;
+
+	/* save a reference to device and bump the count to make it official */
+	get_inotify_dev(dev);
+	watch->dev = dev;
+
+	atomic_inc(&dev->user->inotify_watches);
+
+	inotify_init_watch(&watch->wdata);
+	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
+	if (ret < 0)
+		free_inotify_user_watch(&watch->wdata);
+
+	return ret;
+}
+
+/* Device Interface */
+
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+	struct inotify_device *dev = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &dev->wq, wait);
+	mutex_lock(&dev->ev_mutex);
+	if (!list_empty(&dev->events))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static ssize_t inotify_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	size_t event_size = sizeof (struct inotify_event);
+	struct inotify_device *dev;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	dev = file->private_data;
+
+	while (1) {
+		int events;
+
+		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&dev->ev_mutex);
+		events = !list_empty(&dev->events);
+		mutex_unlock(&dev->ev_mutex);
+		if (events) {
+			ret = 0;
+			break;
+		}
+
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		schedule();
+	}
+
+	finish_wait(&dev->wq, &wait);
+	if (ret)
+		return ret;
+
+	mutex_lock(&dev->ev_mutex);
+	while (1) {
+		struct inotify_kernel_event *kevent;
+
+		ret = buf - start;
+		if (list_empty(&dev->events))
+			break;
+
+		kevent = inotify_dev_get_event(dev);
+		if (event_size + kevent->event.len > count)
+			break;
+
+		if (copy_to_user(buf, &kevent->event, event_size)) {
+			ret = -EFAULT;
+			break;
+		}
+		buf += event_size;
+		count -= event_size;
+
+		if (kevent->name) {
+			if (copy_to_user(buf, kevent->name, kevent->event.len)){
+				ret = -EFAULT;
+				break;
+			}
+			buf += kevent->event.len;
+			count -= kevent->event.len;
+		}
+
+		remove_kevent(dev, kevent);
+	}
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+	struct inotify_device *dev = file->private_data;
+
+	inotify_destroy(dev->ih);
+
+	/* destroy all of the events on this device */
+	mutex_lock(&dev->ev_mutex);
+	while (!list_empty(&dev->events))
+		inotify_dev_event_dequeue(dev);
+	mutex_unlock(&dev->ev_mutex);
+
+	/* free this device: the put matching the get in sys_inotify_init() */
+	put_inotify_dev(dev);
+
+	return 0;
+}
+
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct inotify_device *dev;
+	void __user *p;
+	int ret = -ENOTTY;
+
+	dev = file->private_data;
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		ret = put_user(dev->queue_size, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations inotify_fops = {
+	.poll           = inotify_poll,
+	.read           = inotify_read,
+	.release        = inotify_release,
+	.unlocked_ioctl = inotify_ioctl,
+	.compat_ioctl	= inotify_ioctl,
+};
+
+static const struct inotify_operations inotify_user_ops = {
+	.handle_event	= inotify_dev_queue_event,
+	.destroy_watch	= free_inotify_user_watch,
+};
+
+asmlinkage long sys_inotify_init(void)
+{
+	struct inotify_device *dev;
+	struct inotify_handle *ih;
+	struct user_struct *user;
+	struct file *filp;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	filp = get_empty_filp();
+	if (!filp) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	user = get_uid(current->user);
+	if (unlikely(atomic_read(&user->inotify_devs) >=
+			inotify_max_user_instances)) {
+		ret = -EMFILE;
+		goto out_free_uid;
+	}
+
+	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+	if (unlikely(!dev)) {
+		ret = -ENOMEM;
+		goto out_free_uid;
+	}
+
+	ih = inotify_init(&inotify_user_ops);
+	if (unlikely(IS_ERR(ih))) {
+		ret = PTR_ERR(ih);
+		goto out_free_dev;
+	}
+	dev->ih = ih;
+
+	filp->f_op = &inotify_fops;
+	filp->f_vfsmnt = mntget(inotify_mnt);
+	filp->f_dentry = dget(inotify_mnt->mnt_root);
+	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+	filp->f_mode = FMODE_READ;
+	filp->f_flags = O_RDONLY;
+	filp->private_data = dev;
+
+	INIT_LIST_HEAD(&dev->events);
+	init_waitqueue_head(&dev->wq);
+	mutex_init(&dev->ev_mutex);
+	mutex_init(&dev->up_mutex);
+	dev->event_count = 0;
+	dev->queue_size = 0;
+	dev->max_events = inotify_max_queued_events;
+	dev->user = user;
+	atomic_set(&dev->count, 0);
+
+	get_inotify_dev(dev);
+	atomic_inc(&user->inotify_devs);
+	fd_install(fd, filp);
+
+	return fd;
+out_free_dev:
+	kfree(dev);
+out_free_uid:
+	free_uid(user);
+	put_filp(filp);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+{
+	struct inode *inode;
+	struct inotify_device *dev;
+	struct nameidata nd;
+	struct file *filp;
+	int ret, fput_needed;
+	unsigned flags = 0;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto fput_and_out;
+	}
+
+	if (!(mask & IN_DONT_FOLLOW))
+		flags |= LOOKUP_FOLLOW;
+	if (mask & IN_ONLYDIR)
+		flags |= LOOKUP_DIRECTORY;
+
+	ret = find_inode(path, &nd, flags);
+	if (unlikely(ret))
+		goto fput_and_out;
+
+	/* inode held in place by reference to nd; dev by fget on fd */
+	inode = nd.dentry->d_inode;
+	dev = filp->private_data;
+
+	mutex_lock(&dev->up_mutex);
+	ret = inotify_find_update_watch(dev->ih, inode, mask);
+	if (ret == -ENOENT)
+		ret = create_watch(dev, inode, mask);
+	mutex_unlock(&dev->up_mutex);
+
+	path_release(&nd);
+fput_and_out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+{
+	struct file *filp;
+	struct inotify_device *dev;
+	int ret, fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	dev = filp->private_data;
+
+	/* we free our watch data when we get IN_IGNORED */
+	ret = inotify_rm_wd(dev->ih, wd);
+
+out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+static int
+inotify_get_sb(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA, mnt);
+}
+
+static struct file_system_type inotify_fs_type = {
+    .name           = "inotifyfs",
+    .get_sb         = inotify_get_sb,
+    .kill_sb        = kill_anon_super,
+};
+
+/*
+ * inotify_user_setup - Our initialization function.  Note that we cannot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init inotify_user_setup(void)
+{
+	int ret;
+
+	ret = register_filesystem(&inotify_fs_type);
+	if (unlikely(ret))
+		panic("inotify: register_filesystem returned %d!\n", ret);
+
+	inotify_mnt = kern_mount(&inotify_fs_type);
+	if (IS_ERR(inotify_mnt))
+		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+
+	inotify_max_queued_events = 16384;
+	inotify_max_user_instances = 128;
+	inotify_max_user_watches = 8192;
+
+	watch_cachep = kmem_cache_create("inotify_watch_cache",
+					 sizeof(struct inotify_user_watch),
+					 0, SLAB_PANIC, NULL, NULL);
+	event_cachep = kmem_cache_create("inotify_event_cache",
+					 sizeof(struct inotify_kernel_event),
+					 0, SLAB_PANIC, NULL, NULL);
+
+	return 0;
+}
+
+module_init(inotify_user_setup);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index ca77008146c0..7fa76ed53c10 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -24,15 +24,21 @@
 #include <linux/blkdev.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/security.h>
 
 static int set_task_ioprio(struct task_struct *task, int ioprio)
 {
+	int err;
 	struct io_context *ioc;
 
 	if (task->uid != current->euid &&
 	    task->uid != current->uid && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
+	err = security_task_setioprio(task, ioprio);
+	if (err)
+		return err;
+
 	task_lock(task);
 
 	task->ioprio = ioprio;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 70adbb98bad1..3f9c8ba1fa1f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -56,7 +56,7 @@ static void isofs_put_super(struct super_block *sb)
 }
 
 static void isofs_read_inode(struct inode *);
-static int isofs_statfs (struct super_block *, struct kstatfs *);
+static int isofs_statfs (struct dentry *, struct kstatfs *);
 
 static kmem_cache_t *isofs_inode_cachep;
 
@@ -901,8 +901,10 @@ out_freesbi:
 	return -EINVAL;
 }
 
-static int isofs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (ISOFS_SB(sb)->s_nzones
@@ -1399,10 +1401,11 @@ struct inode *isofs_iget(struct super_block *sb,
 	return inode;
 }
 
-static struct super_block *isofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int isofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type iso9660_fs_type = {
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 3f5102b069db..47678a26c13b 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,67 @@
 #include <linux/slab.h>
 
 /*
- * Unlink a buffer from a transaction.
+ * Unlink a buffer from a transaction checkpoint list.
  *
  * Called with j_list_lock held.
  */
-
-static inline void __buffer_unlink(struct journal_head *jh)
+static inline void __buffer_unlink_first(struct journal_head *jh)
 {
-	transaction_t *transaction;
-
-	transaction = jh->b_cp_transaction;
-	jh->b_cp_transaction = NULL;
+	transaction_t *transaction = jh->b_cp_transaction;
 
 	jh->b_cpnext->b_cpprev = jh->b_cpprev;
 	jh->b_cpprev->b_cpnext = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
+	if (transaction->t_checkpoint_list == jh) {
 		transaction->t_checkpoint_list = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
-		transaction->t_checkpoint_list = NULL;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
 }
 
 /*
  * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
  * Requires j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
@@ -57,12 +95,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
-		__journal_remove_checkpoint(jh);
+		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
-		ret = 1;
 	} else {
 		jbd_unlock_bh_state(bh);
 	}
@@ -117,83 +154,54 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
 }
 
 /*
- * Clean up a transaction's checkpoint list.
- *
- * We wait for any pending IO to complete and make sure any clean
- * buffers are removed from the transaction.
- *
- * Return 1 if we performed any actions which might have destroyed the
- * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
- * the last checkpoint buffer is cleansed)
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
  *
  * Called with j_list_lock held.
  */
-static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
-	struct journal_head *jh, *next_jh, *last_jh;
+	struct journal_head *jh;
 	struct buffer_head *bh;
-	int ret = 0;
-
-	assert_spin_locked(&journal->j_list_lock);
-	jh = transaction->t_checkpoint_list;
-	if (!jh)
-		return 0;
-
-	last_jh = jh->b_cpprev;
-	next_jh = jh;
-	do {
-		jh = next_jh;
+	tid_t this_tid;
+	int released = 0;
+
+	this_tid = transaction->t_tid;
+restart:
+	/* Did somebody clean up the transaction in the meanwhile? */
+	if (journal->j_checkpoint_transactions != transaction ||
+			transaction->t_tid != this_tid)
+		return;
+	while (!released && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		if (buffer_locked(bh)) {
 			atomic_inc(&bh->b_count);
 			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
 			__brelse(bh);
-			goto out_return_1;
+			spin_lock(&journal->j_list_lock);
+			goto restart;
 		}
-
 		/*
-		 * This is foul
+		 * Now in whatever state the buffer currently is, we know that
+		 * it has been written out and so we can drop it from the list
 		 */
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			goto out_return_1;
-		}
-
-		if (jh->b_transaction != NULL) {
-			transaction_t *t = jh->b_transaction;
-			tid_t tid = t->t_tid;
-
-			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
-			log_start_commit(journal, tid);
-			log_wait_commit(journal, tid);
-			goto out_return_1;
-		}
-
-		/*
-		 * AKPM: I think the buffer_jbddirty test is redundant - it
-		 * shouldn't have NULL b_transaction?
-		 */
-		next_jh = jh->b_cpnext;
-		if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
-			BUFFER_TRACE(bh, "remove from checkpoint");
-			__journal_remove_checkpoint(jh);
-			jbd_unlock_bh_state(bh);
-			journal_remove_journal_head(bh);
-			__brelse(bh);
-			ret = 1;
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-	} while (jh != last_jh);
-
-	return ret;
-out_return_1:
-	spin_lock(&journal->j_list_lock);
-	return 1;
+		released = __journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		__brelse(bh);
+	}
 }
 
 #define NR_BATCH	64
@@ -203,9 +211,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
-	spin_unlock(&journal->j_list_lock);
 	ll_rw_block(SWRITE, *batch_count, bhs);
-	spin_lock(&journal->j_list_lock);
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
@@ -221,19 +227,43 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Return 1 if something happened which requires us to abort the current
  * scan of the checkpoint list.  
  *
- * Called with j_list_lock held.
+ * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
-static int __flush_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count,
-			int *drop_count)
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
 
-	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
-		J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	if (buffer_locked(bh)) {
+		atomic_inc(&bh->b_count);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		wait_on_buffer(bh);
+		/* the journal_head may have gone by now */
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+		ret = 1;
+	} else if (jh->b_transaction != NULL) {
+		transaction_t *t = jh->b_transaction;
+		tid_t tid = t->t_tid;
 
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		log_start_commit(journal, tid);
+		log_wait_commit(journal, tid);
+		ret = 1;
+	} else if (!buffer_dirty(bh)) {
+		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+		BUFFER_TRACE(bh, "remove from checkpoint");
+		__journal_remove_checkpoint(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		__brelse(bh);
+		ret = 1;
+	} else {
 		/*
 		 * Important: we are about to write the buffer, and
 		 * possibly block, while still holding the journal lock.
@@ -246,45 +276,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
 		bhs[*batch_count] = bh;
+		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
+			spin_unlock(&journal->j_list_lock);
 			__flush_batch(journal, bhs, batch_count);
 			ret = 1;
 		}
-	} else {
-		int last_buffer = 0;
-		if (jh->b_cpnext == jh) {
-			/* We may be about to drop the transaction.  Tell the
-			 * caller that the lists have changed.
-			 */
-			last_buffer = 1;
-		}
-		if (__try_to_free_cp_buf(jh)) {
-			(*drop_count)++;
-			ret = last_buffer;
-		}
 	}
 	return ret;
 }
 
 /*
- * Perform an actual checkpoint.  We don't write out only enough to
- * satisfy the current blocked requests: rather we submit a reasonably
- * sized chunk of the outstanding data to disk at once for
- * efficiency.  __log_wait_for_space() will retry if we didn't free enough.
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
  * 
- * However, we _do_ take into account the amount requested so that once
- * the IO has been queued, we can return as soon as enough of it has
- * completed to disk.
- *
  * The journal should be locked before calling this function.
  */
 int log_do_checkpoint(journal_t *journal)
 {
+	transaction_t *transaction;
+	tid_t this_tid;
 	int result;
-	int batch_count = 0;
-	struct buffer_head *bhs[NR_BATCH];
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -299,79 +314,68 @@ int log_do_checkpoint(journal_t *journal)
 		return result;
 
 	/*
-	 * OK, we need to start writing disk blocks.  Try to free up a
-	 * quarter of the log in a single checkpoint if we can.
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
 	 */
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	this_tid = transaction->t_tid;
+restart:
 	/*
-	 * AKPM: check this code.  I had a feeling a while back that it
-	 * degenerates into a busy loop at unmount time.
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
 	 */
-	spin_lock(&journal->j_list_lock);
-	while (journal->j_checkpoint_transactions) {
-		transaction_t *transaction;
-		struct journal_head *jh, *last_jh, *next_jh;
-		int drop_count = 0;
-		int cleanup_ret, retry = 0;
-		tid_t this_tid;
-
-		transaction = journal->j_checkpoint_transactions;
-		this_tid = transaction->t_tid;
-		jh = transaction->t_checkpoint_list;
-		last_jh = jh->b_cpprev;
-		next_jh = jh;
-		do {
+	if (journal->j_checkpoint_transactions == transaction &&
+			transaction->t_tid == this_tid) {
+		int batch_count = 0;
+		struct buffer_head *bhs[NR_BATCH];
+		struct journal_head *jh;
+		int retry = 0;
+
+		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
 
-			jh = next_jh;
-			next_jh = jh->b_cpnext;
+			jh = transaction->t_checkpoint_list;
 			bh = jh2bh(jh);
 			if (!jbd_trylock_bh_state(bh)) {
 				jbd_sync_bh(journal, bh);
-				spin_lock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-			retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
-			if (cond_resched_lock(&journal->j_list_lock)) {
+			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (!retry && lock_need_resched(&journal->j_list_lock)){
+				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-		} while (jh != last_jh && !retry);
+		}
 
 		if (batch_count) {
+			if (!retry) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+			}
 			__flush_batch(journal, bhs, &batch_count);
-			retry = 1;
 		}
 
+		if (retry) {
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		/*
-		 * If someone cleaned up this transaction while we slept, we're
-		 * done
-		 */
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
-		if (retry)
-			continue;
-		/*
-		 * Maybe it's a new transaction, but it fell at the same
-		 * address
-		 */
-		if (transaction->t_tid != this_tid)
-			continue;
-		/*
-		 * We have walked the whole transaction list without
-		 * finding anything to write to disk.  We had better be
-		 * able to make some progress or we are in trouble.
+		 * Now we have cleaned up the first transaction's checkpoint
+		 * list. Let's clean up the second one
 		 */
-		cleanup_ret = __cleanup_transaction(journal, transaction);
-		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
+		__wait_cp_io(journal, transaction);
 	}
+out:
 	spin_unlock(&journal->j_list_lock);
 	result = cleanup_journal_tail(journal);
 	if (result < 0)
 		return result;
-
 	return 0;
 }
 
@@ -456,52 +460,98 @@ int cleanup_journal_tail(journal_t *journal)
 /* Checkpoint list management */
 
 /*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked and with j_list_lock held.
+ * *released is set to 1 if __try_to_free_cp_buf() returned 2, else to 0.
+ * Returns number of buffers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret, freed = 0;	/* freed: buffers reaped so far (the return value) */
+
+	*released = 0;
+	if (!jh)	/* empty list: nothing to reap */
+		return 0;
+
+ 	last_jh = jh->b_cpprev;	/* the walk stops after visiting this entry */
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;	/* read the successor before jh is handed to the free attempt */
+		/* Use trylock because of the lock ranking; a contended buffer is skipped */
+		if (jbd_trylock_bh_state(jh2bh(jh))) {
+			ret = __try_to_free_cp_buf(jh);
+			if (ret) {
+				freed++;
+				if (ret == 2) {	/* NOTE(review): 2 presumably means the transaction went away too - confirm */
+					*released = 1;
+					return freed;
+				}
+			}
+		}
+		/*
+		 * This function only frees up some memory
+		 * if possible so we don't have an obligation
+		 * to finish processing. Bail out if preemption
+		 * requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
+/*
  * journal_clean_checkpoint_list
  *
  * Find all the written-back checkpoint buffers in the journal and release them.
  *
  * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
  */
 
 int __journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
 	int ret = 0;
+	int released;
 
 	transaction = journal->j_checkpoint_transactions;
-	if (transaction == 0)
+	if (!transaction)
 		goto out;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
-		struct journal_head *jh;
-
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		jh = transaction->t_checkpoint_list;
-		if (jh) {
-			struct journal_head *last_jh = jh->b_cpprev;
-			struct journal_head *next_jh = jh;
-
-			do {
-				jh = next_jh;
-				next_jh = jh->b_cpnext;
-				/* Use trylock because of the ranknig */
-				if (jbd_trylock_bh_state(jh2bh(jh)))
-					ret += __try_to_free_cp_buf(jh);
-				/*
-				 * This function only frees up some memory
-				 * if possible so we dont have an obligation
-				 * to finish processing. Bail out if preemption
-				 * requested:
-				 */
-				if (need_resched())
-					goto out;
-			} while (jh != last_jh);
-		}
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_list, &released);
+		/*
+		 * This function only frees up some memory if possible so we
+		 * don't have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			goto out;
+		if (released)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list, &released);
+		if (need_resched())
+			goto out;
 	} while (transaction != last_transaction);
 out:
 	return ret;
@@ -516,18 +566,22 @@ out:
  * buffer updates committed in that transaction have safely been stored
  * elsewhere on disk.  To achieve this, all of the buffers in a
  * transaction need to be maintained on the transaction's checkpoint
- * list until they have been rewritten, at which point this function is
+ * lists until they have been rewritten, at which point this function is
  * called to remove the buffer from the existing transaction's
- * checkpoint list.
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
  *
  * This function is called with the journal locked.
  * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
 
-void __journal_remove_checkpoint(struct journal_head *jh)
+int __journal_remove_checkpoint(struct journal_head *jh)
 {
 	transaction_t *transaction;
 	journal_t *journal;
+	int ret = 0;
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -538,8 +592,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	journal = transaction->t_journal;
 
 	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
 
-	if (transaction->t_checkpoint_list != NULL)
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
 		goto out;
 	JBUFFER_TRACE(jh, "transaction has no more buffers");
 
@@ -565,8 +621,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
 	wake_up(&journal->j_wait_logspace);
+	ret = 1;
 out:
 	JBUFFER_TRACE(jh, "exit");
+	return ret;
 }
 
 /*
@@ -628,6 +686,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 	J_ASSERT(transaction->t_shadow_list == NULL);
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(transaction->t_updates == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 002ad2bbc769..0971814c38b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -790,11 +790,22 @@ restart_loop:
 			jbd_unlock_bh_state(bh);
 		} else {
 			J_ASSERT_BH(bh, !buffer_dirty(bh));
-			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-			__journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			journal_remove_journal_head(bh);  /* needs a brelse */
-			release_buffer_page(bh);
+			/* The buffer on BJ_Forget list and not jbddirty means
+			 * it has been freed by this transaction and hence it
+			 * could not have been reallocated until this
+			 * transaction has committed. *BUT* it could be
+			 * reallocated once we have written all the data to
+			 * disk and before we process the buffer on BJ_Forget
+			 * list. */
+			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+			__journal_refile_buffer(jh);
+			if (!jh->b_transaction) {
+				jbd_unlock_bh_state(bh);
+				 /* needs a brelse */
+				journal_remove_journal_head(bh);
+				release_buffer_page(bh);
+			} else
+				jbd_unlock_bh_state(bh);
 		}
 		cond_resched_lock(&journal->j_list_lock);
 	}
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 80d7f53fd0a7..de5bafb4e853 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -531,6 +531,7 @@ static int do_one_pass(journal_t *journal,
 		default:
 			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
 				  blocktype);
+			brelse(bh);
 			goto done;
 		}
 	}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c609f5034fcd..508b2ea91f43 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -227,7 +227,8 @@ repeat_locked:
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
 out:
-	kfree(new_transaction);
+	if (unlikely(new_transaction))		/* It's usually NULL */
+		kfree(new_transaction);
 	return ret;
 }
 
@@ -724,7 +725,8 @@ done:
 	journal_cancel_revoke(handle, jh);
 
 out:
-	kfree(frozen_buffer);
+	if (unlikely(frozen_buffer))	/* It's usually NULL */
+		kfree(frozen_buffer);
 
 	JBUFFER_TRACE(jh, "exit");
 	return error;
@@ -903,7 +905,8 @@ repeat:
 	jbd_unlock_bh_state(bh);
 out:
 	journal_put_journal_head(jh);
-	kfree(committed_data);
+	if (unlikely(committed_data))
+		kfree(committed_data);
 	return err;
 }
 
@@ -2038,7 +2041,8 @@ void __journal_refile_buffer(struct journal_head *jh)
 	__journal_temp_unlink_buffer(jh);
 	jh->b_transaction = jh->b_next_transaction;
 	jh->b_next_transaction = NULL;
-	__journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
+	__journal_file_buffer(jh, jh->b_transaction,
+				was_dirty ? BJ_Metadata : BJ_Reserved);
 	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
 
 	if (was_dirty)
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 020cc097c539..9e46ea6da752 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -377,9 +377,9 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
 
 /* Get statistics of the file system.  */
 static int
-jffs_statfs(struct super_block *sb, struct kstatfs *buf)
+jffs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jffs_control *c = (struct jffs_control *) sb->s_fs_info;
+	struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info;
 	struct jffs_fmcontrol *fmc;
 
 	lock_kernel();
@@ -1785,10 +1785,11 @@ static struct super_operations jffs_ops =
 	.remount_fs	= jffs_remount,
 };
 
-static struct super_block *jffs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int jffs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type jffs_fs_type = {
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 0ef207dfaf6f..5371a403130a 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -247,7 +247,7 @@ flash_safe_read(struct mtd_info *mtd, loff_t from,
 	D3(printk(KERN_NOTICE "flash_safe_read(%p, %08x, %p, %08x)\n",
 		  mtd, (unsigned int) from, buf, count));
 
-	res = MTD_READ(mtd, from, count, &retlen, buf);
+	res = mtd->read(mtd, from, count, &retlen, buf);
 	if (retlen != count) {
 		panic("Didn't read all bytes in flash_safe_read(). Returned %d\n", res);
 	}
@@ -262,7 +262,7 @@ flash_read_u32(struct mtd_info *mtd, loff_t from)
 	__u32 ret;
 	int res;
 
-	res = MTD_READ(mtd, from, 4, &retlen, (unsigned char *)&ret);
+	res = mtd->read(mtd, from, 4, &retlen, (unsigned char *)&ret);
 	if (retlen != 4) {
 		printk("Didn't read all bytes in flash_read_u32(). Returned %d\n", res);
 		return 0;
@@ -282,7 +282,7 @@ flash_safe_write(struct mtd_info *mtd, loff_t to,
 	D3(printk(KERN_NOTICE "flash_safe_write(%p, %08x, %p, %08x)\n",
 		  mtd, (unsigned int) to, buf, count));
 
-	res = MTD_WRITE(mtd, to, count, &retlen, buf);
+	res = mtd->write(mtd, to, count, &retlen, buf);
 	if (retlen != count) {
 		printk("Didn't write all bytes in flash_safe_write(). Returned %d\n", res);
 	}
@@ -300,9 +300,9 @@ flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
 
 	D3(printk(KERN_NOTICE "flash_safe_writev(%p, %08x, %p)\n",
 		  mtd, (unsigned int) to, vecs));
-	
+
 	if (mtd->writev) {
-		res = MTD_WRITEV(mtd, vecs, iovec_cnt, to, &retlen);
+		res = mtd->writev(mtd, vecs, iovec_cnt, to, &retlen);
 		return res ? res : retlen;
 	}
 	/* Not implemented writev. Repeatedly use write - on the not so
@@ -312,7 +312,8 @@ flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
 	retlen=0;
 
 	for (i=0; !res && i<iovec_cnt; i++) {
-		res = MTD_WRITE(mtd, to, vecs[i].iov_len, &retlen_a, vecs[i].iov_base);
+		res = mtd->write(mtd, to, vecs[i].iov_len, &retlen_a,
+				 vecs[i].iov_base);
 		if (retlen_a != vecs[i].iov_len) {
 			printk("Didn't write all bytes in flash_safe_writev(). Returned %d\n", res);
 			if (i != iovec_cnt-1)
@@ -393,7 +394,7 @@ flash_erase_region(struct mtd_info *mtd, loff_t start,
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	add_wait_queue(&wait_q, &wait);
 
-	if (MTD_ERASE(mtd, erase) < 0) {
+	if (mtd->erase(mtd, erase) < 0) {
 		set_current_state(TASK_RUNNING);
 		remove_wait_queue(&wait_q, &wait);
 		kfree(erase);
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 77dc5561a04e..7f28ee0bd132 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -12,6 +12,9 @@ jffs2-y	+= symlink.o build.o erase.o background.o fs.o writev.o
 jffs2-y	+= super.o debug.o
 
 jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER)	+= wbuf.o
+jffs2-$(CONFIG_JFFS2_FS_XATTR)		+= xattr.o xattr_trusted.o xattr_user.o
+jffs2-$(CONFIG_JFFS2_FS_SECURITY)	+= security.o
+jffs2-$(CONFIG_JFFS2_FS_POSIX_ACL)	+= acl.o
 jffs2-$(CONFIG_JFFS2_RUBIN)	+= compr_rubin.o
 jffs2-$(CONFIG_JFFS2_RTIME)	+= compr_rtime.o
 jffs2-$(CONFIG_JFFS2_ZLIB)	+= compr_zlib.o
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index b7943439b6ec..c8f0bd64e53e 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -150,3 +150,24 @@ the buffer.
 
 Ordering constraints:
 	Lock wbuf_sem last, after the alloc_sem or and f->sem.
+
+
+	c->xattr_sem
+	------------
+
+This read/write semaphore protects against concurrent access to the
+xattr related objects, which include stuff in the superblock and ic->xref.
+On read-only paths, taking it for write is more exclusion than needed;
+taking it for read is enough. But you must hold it for write when updating,
+creating or deleting any xattr related object.
+
+Once xattr_sem is released, there is no assurance that those objects still
+exist. Thus, a sequence of operations often has to be retried when such an
+object must be updated while only the read semaphore is held.
+For example, do_jffs2_getxattr() holds the read semaphore to scan xref and
+xdatum at first. But if it is necessary to load a name/value pair from the
+medium, it releases the read semaphore and retries the whole process while
+holding the write semaphore.
+
+Ordering constraints:
+	Lock xattr_sem last, after the alloc_sem.
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
new file mode 100644
index 000000000000..320dd48b834e
--- /dev/null
+++ b/fs/jffs2/acl.c
@@ -0,0 +1,485 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static size_t jffs2_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(struct jffs2_acl_header)
+		       + count * sizeof(struct jffs2_acl_entry_short);
+	} else {
+		return sizeof(struct jffs2_acl_header)
+		       + 4 * sizeof(struct jffs2_acl_entry_short)
+		       + (count - 4) * sizeof(struct jffs2_acl_entry);
+	}
+}
+
+static int jffs2_acl_count(size_t size)
+{
+	size_t s;	/* bytes left after the four leading short entries */
+	/* Entry count for a medium ACL of @size bytes (header included); -1 if malformed. */
+	size -= sizeof(struct jffs2_acl_header);	/* jffs2_acl_from_medium() checked size >= header */
+	if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {	/* size_t is unsigned: compare, never test "< 0" */
+		if (size % sizeof(struct jffs2_acl_entry_short))
+			return -1;
+		return size / sizeof(struct jffs2_acl_entry_short);
+	} else {
+		s = size - 4 * sizeof(struct jffs2_acl_entry_short);
+		if (s % sizeof(struct jffs2_acl_entry))
+			return -1;
+		return s / sizeof(struct jffs2_acl_entry) + 4;
+	}
+}
+
+static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size)
+{
+	void *end = value + size;
+	struct jffs2_acl_header *header = value;
+	struct jffs2_acl_entry *entry;
+	struct posix_acl *acl;
+	uint32_t ver;
+	int i, count;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct jffs2_acl_header))
+		return ERR_PTR(-EINVAL);
+	ver = je32_to_cpu(header->a_version);
+	if (ver != JFFS2_ACL_VERSION) {
+		JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver);
+		return ERR_PTR(-EINVAL);
+	}
+
+	value += sizeof(struct jffs2_acl_header);
+	count = jffs2_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	for (i=0; i < count; i++) {
+		entry = value;
+		if (value + sizeof(struct jffs2_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag);
+		acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm);
+		switch (acl->a_entries[i].e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				value += sizeof(struct jffs2_acl_entry_short);
+				acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
+				break;
+
+			case ACL_USER:
+			case ACL_GROUP:
+				value += sizeof(struct jffs2_acl_entry);
+				if (value > end)
+					goto fail;
+				acl->a_entries[i].e_id = je32_to_cpu(entry->e_id);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+ fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
+{
+	struct jffs2_acl_header *header;
+	struct jffs2_acl_entry *entry;
+	void *e;
+	size_t i;
+
+	*size = jffs2_acl_size(acl->a_count);
+	header = kmalloc(sizeof(*header) + acl->a_count * sizeof(*entry), GFP_KERNEL);
+	if (!header)
+		return ERR_PTR(-ENOMEM);
+	header->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
+	e = header + 1;
+	for (i=0; i < acl->a_count; i++) {
+		entry = e;
+		entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag);
+		entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm);
+		switch(acl->a_entries[i].e_tag) {
+			case ACL_USER:
+			case ACL_GROUP:
+				entry->e_id = cpu_to_je32(acl->a_entries[i].e_id);
+				e += sizeof(struct jffs2_acl_entry);
+				break;
+
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				e += sizeof(struct jffs2_acl_entry_short);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	return header;
+ fail:
+	kfree(header);
+	return ERR_PTR(-EINVAL);
+}
+
+static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
+{
+	struct posix_acl *acl = JFFS2_ACL_NOT_CACHED;
+
+	spin_lock(&inode->i_lock);
+	if (*i_acl != JFFS2_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*i_acl);
+	spin_unlock(&inode->i_lock);
+	return acl;
+}
+
+static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*i_acl != JFFS2_ACL_NOT_CACHED)
+		posix_acl_release(*i_acl);
+	*i_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct posix_acl *acl;
+	char *value = NULL;
+	int rc, xprefix;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		acl = jffs2_iget_acl(inode, &f->i_acl_access);
+		if (acl != JFFS2_ACL_NOT_CACHED)
+			return acl;
+		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		acl = jffs2_iget_acl(inode, &f->i_acl_default);
+		if (acl != JFFS2_ACL_NOT_CACHED)
+			return acl;
+		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
+	if (rc > 0) {
+		value = kmalloc(rc, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		rc = do_jffs2_getxattr(inode, xprefix, "", value, rc);
+	}
+	if (rc > 0) {
+		acl = jffs2_acl_from_medium(value, rc);
+	} else if (rc == -ENODATA || rc == -ENOSYS) {
+		acl = NULL;
+	} else {
+		acl = ERR_PTR(rc);
+	}
+	if (value)
+		kfree(value);
+	if (!IS_ERR(acl)) {
+		switch (type) {
+		case ACL_TYPE_ACCESS:
+			jffs2_iset_acl(inode, &f->i_acl_access, acl);
+			break;
+		case ACL_TYPE_DEFAULT:
+			jffs2_iset_acl(inode, &f->i_acl_default, acl);
+			break;
+		}
+	}
+	return acl;
+}
+
+static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	size_t size = 0;
+	char *value = NULL;
+	int rc, xprefix;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			rc = posix_acl_equiv_mode(acl, &mode);
+			if (rc < 0)
+				return rc;
+			if (inode->i_mode != mode) {
+				inode->i_mode = mode;
+				jffs2_dirty_inode(inode);
+			}
+			if (rc == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (acl) {
+		value = jffs2_acl_to_medium(acl, &size);
+		if (IS_ERR(value))
+			return PTR_ERR(value);
+	}
+
+	rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0);
+	if (value)
+		kfree(value);
+	if (!rc) {
+		switch(type) {
+		case ACL_TYPE_ACCESS:
+			jffs2_iset_acl(inode, &f->i_acl_access, acl);
+			break;
+		case ACL_TYPE_DEFAULT:
+			jffs2_iset_acl(inode, &f->i_acl_default, acl);
+			break;
+		}
+	}
+	return rc;
+}
+
+static int jffs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		rc = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return rc;
+	}
+	return -EAGAIN;
+}
+
+int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	return generic_permission(inode, mask, jffs2_check_acl);
+}
+
+int jffs2_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct posix_acl *acl = NULL, *clone;
+	mode_t mode;
+	int rc = 0;
+
+	f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+	if (!S_ISLNK(inode->i_mode)) {
+		acl = jffs2_get_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if (acl) {
+		if (S_ISDIR(inode->i_mode)) {
+			rc = jffs2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+			if (rc)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		rc = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+		mode = inode->i_mode;
+		rc = posix_acl_create_masq(clone, &mode);
+		if (rc >= 0) {
+			inode->i_mode = mode;
+			if (rc > 0)
+				rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone);
+		}
+		posix_acl_release(clone);
+	}
+ cleanup:
+	posix_acl_release(acl);
+	return rc;
+}
+
+void jffs2_clear_acl(struct inode *inode)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+
+	if (f->i_acl_access && f->i_acl_access != JFFS2_ACL_NOT_CACHED) {
+		posix_acl_release(f->i_acl_access);
+		f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	}
+	if (f->i_acl_default && f->i_acl_default != JFFS2_ACL_NOT_CACHED) {
+		posix_acl_release(f->i_acl_default);
+		f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+	}
+}
+
+int jffs2_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl, *clone;
+	int rc;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	rc = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!rc)
+		rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone);
+	posix_acl_release(clone);
+	return rc;
+}
+
+static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size,
+					 const char *name, size_t name_len)
+{
+	const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (list && retlen <= list_size)
+		strcpy(list, POSIX_ACL_XATTR_ACCESS);
+	return retlen;
+}
+
+static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size,
+					  const char *name, size_t name_len)
+{
+	const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (list && retlen <= list_size)
+		strcpy(list, POSIX_ACL_XATTR_DEFAULT);
+	return retlen;
+}
+
+static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	acl = jffs2_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (!acl)
+		return -ENODATA;
+	rc = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return rc;
+}
+
+static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (acl) {
+			rc = posix_acl_valid(acl);
+			if (rc)
+				goto out;
+		}
+	} else {
+		acl = NULL;
+	}
+	rc = jffs2_set_acl(inode, type, acl);
+ out:
+	posix_acl_release(acl);
+	return rc;
+}
+
+static int jffs2_acl_access_setxattr(struct inode *inode, const char *name,
+				     const void *buffer, size_t size, int flags)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int jffs2_acl_default_setxattr(struct inode *inode, const char *name,
+				      const void *buffer, size_t size, int flags)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+struct xattr_handler jffs2_acl_access_xattr_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= jffs2_acl_access_listxattr,
+	.get	= jffs2_acl_access_getxattr,
+	.set	= jffs2_acl_access_setxattr,
+};
+
+struct xattr_handler jffs2_acl_default_xattr_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= jffs2_acl_default_listxattr,
+	.get	= jffs2_acl_default_getxattr,
+	.set	= jffs2_acl_default_setxattr,
+};
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
new file mode 100644
index 000000000000..8893bd1a6ba7
--- /dev/null
+++ b/fs/jffs2/acl.h
@@ -0,0 +1,54 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#ifndef _JFFS2_FS_ACL_H_
+#define _JFFS2_FS_ACL_H_
+
+/* ACL entry with tag, permission and id fields. */
+struct jffs2_acl_entry {
+	jint16_t	e_tag;
+	jint16_t	e_perm;
+	jint32_t	e_id;
+};
+
+/* Shorter ACL entry form: tag and permission fields only, no id. */
+struct jffs2_acl_entry_short {
+	jint16_t	e_tag;
+	jint16_t	e_perm;
+};
+
+/* ACL header carrying only a version field. */
+struct jffs2_acl_header {
+	jint32_t	a_version;
+};
+
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+
+#define JFFS2_ACL_NOT_CACHED ((void *)-1)
+
+extern int jffs2_permission(struct inode *, int, struct nameidata *);
+extern int jffs2_acl_chmod(struct inode *);
+extern int jffs2_init_acl(struct inode *, struct inode *);
+extern void jffs2_clear_acl(struct inode *);
+
+extern struct xattr_handler jffs2_acl_access_xattr_handler;
+extern struct xattr_handler jffs2_acl_default_xattr_handler;
+
+#else
+
+/* ACL support compiled out: jffs2_permission is NULL and the helpers expand to 0 or nothing. */
+#define jffs2_permission NULL
+#define jffs2_acl_chmod(inode)		(0)
+#define jffs2_init_acl(inode,dir)	(0)
+#define jffs2_clear_acl(inode)
+
+#endif	/* CONFIG_JFFS2_FS_POSIX_ACL */
+
+#endif /* _JFFS2_FS_ACL_H_ */
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 70f7a896c04a..02826967ab58 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -160,6 +160,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
 		ic->scan_dents = NULL;
 		cond_resched();
 	}
+	jffs2_build_xattr_subsystem(c);
 	c->flags &= ~JFFS2_SB_FLAG_BUILDING;
 
 	dbg_fsbuild("FS build complete\n");
@@ -178,6 +179,7 @@ exit:
 				jffs2_free_full_dirent(fd);
 			}
 		}
+		jffs2_clear_xattr_subsystem(c);
 	}
 
 	return ret;
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index e7944e665b9f..7001ba26c067 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -412,7 +412,7 @@ void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig)
                 kfree(comprbuf);
 }
 
-int jffs2_compressors_init(void)
+int __init jffs2_compressors_init(void)
 {
 /* Registering compressors */
 #ifdef CONFIG_JFFS2_ZLIB
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index a77e830d85c5..509b8b1c0811 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -23,8 +23,8 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_i.h>
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_i.h"
+#include "jffs2_fs_sb.h"
 #include "nodelist.h"
 
 #define JFFS2_RUBINMIPS_PRIORITY 10
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 1fe17de713e8..72b4fc13a106 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -192,13 +192,13 @@ __jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c,
 		else
 			my_dirty_size += totlen;
 
-		if ((!ref2->next_phys) != (ref2 == jeb->last_node)) {
-			JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next_phys at %#08x (mem %p), last_node is at %#08x (mem %p).\n",
-				ref_offset(ref2), ref2, ref_offset(ref2->next_phys), ref2->next_phys,
-				ref_offset(jeb->last_node), jeb->last_node);
+		if ((!ref_next(ref2)) != (ref2 == jeb->last_node)) {
+			JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next at %#08x (mem %p), last_node is at %#08x (mem %p).\n",
+				    ref_offset(ref2), ref2, ref_offset(ref_next(ref2)), ref_next(ref2),
+				    ref_offset(jeb->last_node), jeb->last_node);
 			goto error;
 		}
-		ref2 = ref2->next_phys;
+		ref2 = ref_next(ref2);
 	}
 
 	if (my_used_size != jeb->used_size) {
@@ -268,9 +268,9 @@ __jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c,
 	}
 
 	printk(JFFS2_DBG);
-	for (ref = jeb->first_node; ; ref = ref->next_phys) {
+	for (ref = jeb->first_node; ; ref = ref_next(ref)) {
 		printk("%#08x(%#x)", ref_offset(ref), ref->__totlen);
-		if (ref->next_phys)
+		if (ref_next(ref))
 			printk("->");
 		else
 			break;
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index 162af6dfe292..5fa494a792b2 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -171,6 +171,12 @@
 #define dbg_memalloc(fmt, ...)
 #endif
 
+/* Watch the XATTR subsystem; dbg_xattr() is empty unless JFFS2_DBG_XATTR_MESSAGES is defined */
+#ifdef JFFS2_DBG_XATTR_MESSAGES
+#define dbg_xattr(fmt, ...)  JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_xattr(fmt, ...)
+#endif
 
 /* "Sanity" checks */
 void
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 8bc7a5018e40..edd8371fc6a5 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -17,8 +17,8 @@
 #include <linux/fs.h>
 #include <linux/crc32.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_i.h>
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_i.h"
+#include "jffs2_fs_sb.h"
 #include <linux/time.h>
 #include "nodelist.h"
 
@@ -57,7 +57,12 @@ struct inode_operations jffs2_dir_inode_operations =
 	.rmdir =	jffs2_rmdir,
 	.mknod =	jffs2_mknod,
 	.rename =	jffs2_rename,
+	.permission =	jffs2_permission,
 	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 /***********************************************************************/
@@ -78,6 +83,9 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 
 	D1(printk(KERN_DEBUG "jffs2_lookup()\n"));
 
+	if (target->d_name.len > JFFS2_MAX_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	dir_f = JFFS2_INODE_INFO(dir_i);
 	c = JFFS2_SB_INFO(dir_i->i_sb);
 
@@ -206,12 +214,15 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	ret = jffs2_do_create(c, dir_f, f, ri,
 			      dentry->d_name.name, dentry->d_name.len);
 
-	if (ret) {
-		make_bad_inode(inode);
-		iput(inode);
-		jffs2_free_raw_inode(ri);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret)
+		goto fail;
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret)
+		goto fail;
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
 
@@ -221,6 +232,12 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
 		  inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->nlink, inode->i_mapping->nrpages));
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	iput(inode);
+	jffs2_free_raw_inode(ri);
+	return ret;
 }
 
 /***********************************************************************/
@@ -291,7 +308,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
 	int namelen;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret, targetlen = strlen(target);
 
 	/* FIXME: If you care. We'd need to use frags for the target
@@ -310,8 +327,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	 * Just the node will do for now, though
 	 */
 	namelen = dentry->d_name.len;
-	ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 
 	if (ret) {
 		jffs2_free_raw_inode(ri);
@@ -339,7 +356,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	ri->data_crc = cpu_to_je32(crc32(0, target, targetlen));
 	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 
-	fn = jffs2_write_dnode(c, f, ri, target, targetlen, phys_ofs, ALLOC_NORMAL);
+	fn = jffs2_write_dnode(c, f, ri, target, targetlen, ALLOC_NORMAL);
 
 	jffs2_free_raw_inode(ri);
 
@@ -371,8 +388,20 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
 		/* Eep. */
 		jffs2_clear_inode(inode);
@@ -404,7 +433,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
 
 	if (IS_ERR(fd)) {
 		/* dirent failed to write. Delete the inode normally
@@ -442,7 +471,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
 	int namelen;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	mode |= S_IFDIR;
@@ -457,8 +486,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	 * Just the node will do for now, though
 	 */
 	namelen = dentry->d_name.len;
-	ret = jffs2_reserve_space(c, sizeof(*ri), &phys_ofs, &alloclen, ALLOC_NORMAL,
-				JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
+				  JFFS2_SUMMARY_INODE_SIZE);
 
 	if (ret) {
 		jffs2_free_raw_inode(ri);
@@ -483,7 +512,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	ri->data_crc = cpu_to_je32(0);
 	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 
-	fn = jffs2_write_dnode(c, f, ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+	fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
 
 	jffs2_free_raw_inode(ri);
 
@@ -501,8 +530,20 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
 		/* Eep. */
 		jffs2_clear_inode(inode);
@@ -534,7 +575,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
 
 	if (IS_ERR(fd)) {
 		/* dirent failed to write. Delete the inode normally
@@ -588,12 +629,12 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
 	int namelen;
-	jint16_t dev;
+	union jffs2_device_node dev;
 	int devlen = 0;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
-	if (!old_valid_dev(rdev))
+	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
 	ri = jffs2_alloc_raw_inode();
@@ -602,17 +643,15 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 
 	c = JFFS2_SB_INFO(dir_i->i_sb);
 
-	if (S_ISBLK(mode) || S_ISCHR(mode)) {
-		dev = cpu_to_je16(old_encode_dev(rdev));
-		devlen = sizeof(dev);
-	}
+	if (S_ISBLK(mode) || S_ISCHR(mode))
+		devlen = jffs2_encode_dev(&dev, rdev);
 
 	/* Try to reserve enough space for both node and dirent.
 	 * Just the node will do for now, though
 	 */
 	namelen = dentry->d_name.len;
-	ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 
 	if (ret) {
 		jffs2_free_raw_inode(ri);
@@ -639,7 +678,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	ri->data_crc = cpu_to_je32(crc32(0, &dev, devlen));
 	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 
-	fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, phys_ofs, ALLOC_NORMAL);
+	fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, ALLOC_NORMAL);
 
 	jffs2_free_raw_inode(ri);
 
@@ -657,8 +696,20 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
 		/* Eep. */
 		jffs2_clear_inode(inode);
@@ -693,7 +744,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
 
 	if (IS_ERR(fd)) {
 		/* dirent failed to write. Delete the inode normally
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index dad68fdffe9e..1862e8bc101d 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -30,7 +30,6 @@ static void jffs2_erase_callback(struct erase_info *);
 #endif
 static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset);
 static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
-static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 
 static void jffs2_erase_block(struct jffs2_sb_info *c,
@@ -136,7 +135,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
 			c->used_size -= jeb->used_size;
 			c->dirty_size -= jeb->dirty_size;
 			jeb->wasted_size = jeb->used_size = jeb->dirty_size = jeb->free_size = 0;
-			jffs2_free_all_node_refs(c, jeb);
+			jffs2_free_jeb_node_refs(c, jeb);
 			list_add(&jeb->list, &c->erasing_list);
 			spin_unlock(&c->erase_completion_lock);
 
@@ -231,6 +230,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 			   at the end of the linked list. Stash it and continue
 			   from the beginning of the list */
 			ic = (struct jffs2_inode_cache *)(*prev);
+			BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE);
 			prev = &ic->nodes;
 			continue;
 		}
@@ -283,22 +283,27 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 		jffs2_del_ino_cache(c, ic);
 }
 
-static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
-	struct jffs2_raw_node_ref *ref;
+	struct jffs2_raw_node_ref *block, *ref;	/* block: first entry of the ref block being walked */
 	D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset));
-	while(jeb->first_node) {
-		ref = jeb->first_node;
-		jeb->first_node = ref->next_phys;
 
-		/* Remove from the inode-list */
-		if (ref->next_in_ino)
+	block = ref = jeb->first_node;
+
+	while (ref) {
+		if (ref->flash_offset == REF_LINK_NODE) {	/* link entry: follow it to the next block */
+			ref = ref->next_in_ino;
+			jffs2_free_refblock(block);	/* release the block just walked */
+			block = ref;
+			continue;
+		}
+		if (ref->flash_offset != REF_EMPTY_NODE && ref->next_in_ino)
 			jffs2_remove_node_refs_from_ino_list(c, ref, jeb);
 		/* else it was a non-inode node or already removed, so don't bother */
 
-		jffs2_free_raw_node_ref(ref);
+		ref++;	/* step to the adjacent entry of the same block */
 	}
-	jeb->last_node = NULL;
+	jeb->first_node = jeb->last_node = NULL;
 }
 
 static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset)
@@ -351,7 +356,6 @@ fail:
 
 static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
-	struct jffs2_raw_node_ref *marker_ref = NULL;
 	size_t retlen;
 	int ret;
 	uint32_t bad_offset;
@@ -373,12 +377,8 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 				goto filebad;
 		}
 
-		jeb->first_node = jeb->last_node = NULL;
+		/* Everything else got zeroed before the erase */
 		jeb->free_size = c->sector_size;
-		jeb->used_size = 0;
-		jeb->dirty_size = 0;
-		jeb->wasted_size = 0;
-
 	} else {
 
 		struct kvec vecs[1];
@@ -388,11 +388,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 			.totlen =	cpu_to_je32(c->cleanmarker_size)
 		};
 
-		marker_ref = jffs2_alloc_raw_node_ref();
-		if (!marker_ref) {
-			printk(KERN_WARNING "Failed to allocate raw node ref for clean marker. Refiling\n");
-			goto refile;
-		}
+		jffs2_prealloc_raw_node_refs(c, jeb, 1);
 
 		marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
 
@@ -408,21 +404,13 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 				printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n",
 				       jeb->offset, sizeof(marker), retlen);
 
-			jffs2_free_raw_node_ref(marker_ref);
 			goto filebad;
 		}
 
-		marker_ref->next_in_ino = NULL;
-		marker_ref->next_phys = NULL;
-		marker_ref->flash_offset = jeb->offset | REF_NORMAL;
-		marker_ref->__totlen = c->cleanmarker_size;
-
-		jeb->first_node = jeb->last_node = marker_ref;
-
-		jeb->free_size = c->sector_size - c->cleanmarker_size;
-		jeb->used_size = c->cleanmarker_size;
-		jeb->dirty_size = 0;
-		jeb->wasted_size = 0;
+		/* Everything else got zeroed before the erase */
+		jeb->free_size = c->sector_size;
+		/* FIXME Special case for cleanmarker in empty block */
+		jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL);
 	}
 
 	spin_lock(&c->erase_completion_lock);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 9f4171213e58..bb8844f40e48 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -54,7 +54,12 @@ const struct file_operations jffs2_file_operations =
 
 struct inode_operations jffs2_file_inode_operations =
 {
-	.setattr =	jffs2_setattr
+	.permission =	jffs2_permission,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 struct address_space_operations jffs2_file_address_operations =
@@ -129,13 +134,13 @@ static int jffs2_prepare_write (struct file *filp, struct page *pg,
 		struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 		struct jffs2_raw_inode ri;
 		struct jffs2_full_dnode *fn;
-		uint32_t phys_ofs, alloc_len;
+		uint32_t alloc_len;
 
 		D1(printk(KERN_DEBUG "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
 			  (unsigned int)inode->i_size, pageofs));
 
-		ret = jffs2_reserve_space(c, sizeof(ri), &phys_ofs, &alloc_len,
-					ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 		if (ret)
 			return ret;
 
@@ -161,7 +166,7 @@ static int jffs2_prepare_write (struct file *filp, struct page *pg,
 		ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
 		ri.data_crc = cpu_to_je32(0);
 
-		fn = jffs2_write_dnode(c, f, &ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+		fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_NORMAL);
 
 		if (IS_ERR(fn)) {
 			ret = PTR_ERR(fn);
@@ -215,12 +220,20 @@ static int jffs2_commit_write (struct file *filp, struct page *pg,
 	D1(printk(KERN_DEBUG "jffs2_commit_write(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
 		  inode->i_ino, pg->index << PAGE_CACHE_SHIFT, start, end, pg->flags));
 
-	if (!start && end == PAGE_CACHE_SIZE) {
-		/* We need to avoid deadlock with page_cache_read() in
-		   jffs2_garbage_collect_pass(). So we have to mark the
-		   page up to date, to prevent page_cache_read() from
-		   trying to re-lock it. */
-		SetPageUptodate(pg);
+	if (end == PAGE_CACHE_SIZE) {
+		if (!start) {
+			/* We need to avoid deadlock with page_cache_read() in
+			   jffs2_garbage_collect_pass(). So we have to mark the
+			   page up to date, to prevent page_cache_read() from
+			   trying to re-lock it. */
+			SetPageUptodate(pg);
+		} else {
+			/* When writing out the end of a page, write out the
+			   _whole_ page. This helps to reduce the number of
+			   nodes in files which have many short writes, like
+			   syslog files. */
+			start = aligned_start = 0;
+		}
 	}
 
 	ri = jffs2_alloc_raw_inode();
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 09e5d10b8840..2900ec3ec3af 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -33,11 +33,11 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_raw_inode *ri;
-	unsigned short dev;
+	union jffs2_device_node dev;
 	unsigned char *mdata = NULL;
 	int mdatalen = 0;
 	unsigned int ivalid;
-	uint32_t phys_ofs, alloclen;
+	uint32_t alloclen;
 	int ret;
 	D1(printk(KERN_DEBUG "jffs2_setattr(): ino #%lu\n", inode->i_ino));
 	ret = inode_change_ok(inode, iattr);
@@ -51,20 +51,24 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	   it out again with the appropriate data attached */
 	if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
 		/* For these, we don't actually need to read the old node */
-		dev = old_encode_dev(inode->i_rdev);
+		mdatalen = jffs2_encode_dev(&dev, inode->i_rdev);
 		mdata = (char *)&dev;
-		mdatalen = sizeof(dev);
 		D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen));
 	} else if (S_ISLNK(inode->i_mode)) {
+		down(&f->sem);
 		mdatalen = f->metadata->size;
 		mdata = kmalloc(f->metadata->size, GFP_USER);
-		if (!mdata)
+		if (!mdata) {
+			up(&f->sem);
 			return -ENOMEM;
+		}
 		ret = jffs2_read_dnode(c, f, f->metadata, mdata, 0, mdatalen);
 		if (ret) {
+			up(&f->sem);
 			kfree(mdata);
 			return ret;
 		}
+		up(&f->sem);
 		D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of symlink target\n", mdatalen));
 	}
 
@@ -75,8 +79,8 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 		return -ENOMEM;
 	}
 
-	ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 	if (ret) {
 		jffs2_free_raw_inode(ri);
 		if (S_ISLNK(inode->i_mode & S_IFMT))
@@ -127,7 +131,7 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	else
 		ri->data_crc = cpu_to_je32(0);
 
-	new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, phys_ofs, ALLOC_NORMAL);
+	new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, ALLOC_NORMAL);
 	if (S_ISLNK(inode->i_mode))
 		kfree(mdata);
 
@@ -180,12 +184,17 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 
 int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
-	return jffs2_do_setattr(dentry->d_inode, iattr);
+	struct inode *inode = dentry->d_inode;
+	int rc = jffs2_do_setattr(inode, iattr);
+
+	if (rc == 0 && (iattr->ia_valid & ATTR_MODE))	/* setattr succeeded and touched the mode */
+		rc = jffs2_acl_chmod(inode);
+	return rc;
 }
 
-int jffs2_statfs(struct super_block *sb, struct kstatfs *buf)
+int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb);
 	unsigned long avail;
 
 	buf->f_type = JFFS2_SUPER_MAGIC;
@@ -219,6 +228,7 @@ void jffs2_clear_inode (struct inode *inode)
 
 	D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
 
+	jffs2_xattr_delete_inode(c, f->inocache);
 	jffs2_do_clear_inode(c, f);
 }
 
@@ -227,6 +237,8 @@ void jffs2_read_inode (struct inode *inode)
 	struct jffs2_inode_info *f;
 	struct jffs2_sb_info *c;
 	struct jffs2_raw_inode latest_node;
+	union jffs2_device_node jdev;
+	dev_t rdev = 0;
 	int ret;
 
 	D1(printk(KERN_DEBUG "jffs2_read_inode(): inode->i_ino == %lu\n", inode->i_ino));
@@ -258,7 +270,6 @@ void jffs2_read_inode (struct inode *inode)
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 
 	switch (inode->i_mode & S_IFMT) {
-		jint16_t rdev;
 
 	case S_IFLNK:
 		inode->i_op = &jffs2_symlink_inode_operations;
@@ -292,8 +303,16 @@ void jffs2_read_inode (struct inode *inode)
 	case S_IFBLK:
 	case S_IFCHR:
 		/* Read the device numbers from the media */
+		if (f->metadata->size != sizeof(jdev.old) &&
+		    f->metadata->size != sizeof(jdev.new)) {
+			printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
+			up(&f->sem);
+			jffs2_do_clear_inode(c, f);
+			make_bad_inode(inode);
+			return;
+		}
 		D1(printk(KERN_DEBUG "Reading device numbers from flash\n"));
-		if (jffs2_read_dnode(c, f, f->metadata, (char *)&rdev, 0, sizeof(rdev)) < 0) {
+		if (jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size) < 0) {
 			/* Eep */
 			printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
 			up(&f->sem);
@@ -301,12 +320,15 @@ void jffs2_read_inode (struct inode *inode)
 			make_bad_inode(inode);
 			return;
 		}
+		if (f->metadata->size == sizeof(jdev.old))
+			rdev = old_decode_dev(je16_to_cpu(jdev.old));
+		else
+			rdev = new_decode_dev(je32_to_cpu(jdev.new));
 
 	case S_IFSOCK:
 	case S_IFIFO:
 		inode->i_op = &jffs2_file_inode_operations;
-		init_special_inode(inode, inode->i_mode,
-				   old_decode_dev((je16_to_cpu(rdev))));
+		init_special_inode(inode, inode->i_mode, rdev);
 		break;
 
 	default:
@@ -492,6 +514,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	memset(c->inocache_list, 0, INOCACHE_HASHSIZE * sizeof(struct jffs2_inode_cache *));
 
+	jffs2_init_xattr_subsystem(c);
+
 	if ((ret = jffs2_do_mount_fs(c)))
 		goto out_inohash;
 
@@ -526,6 +550,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 	else
 		kfree(c->blocks);
  out_inohash:
+	jffs2_clear_xattr_subsystem(c);
 	kfree(c->inocache_list);
  out_wbuf:
 	jffs2_flash_cleanup(c);
@@ -639,13 +664,6 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
 			return ret;
 	}
 
-	/* add setups for other bizarre flashes here... */
-	if (jffs2_nor_ecc(c)) {
-		ret = jffs2_nor_ecc_flash_setup(c);
-		if (ret)
-			return ret;
-	}
-
 	/* and Dataflash */
 	if (jffs2_dataflash(c)) {
 		ret = jffs2_dataflash_setup(c);
@@ -669,11 +687,6 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
 		jffs2_nand_flash_cleanup(c);
 	}
 
-	/* add cleanups for other bizarre flashes here... */
-	if (jffs2_nor_ecc(c)) {
-		jffs2_nor_ecc_flash_cleanup(c);
-	}
-
 	/* and DataFlash */
 	if (jffs2_dataflash(c)) {
 		jffs2_dataflash_cleanup(c);
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index f9ffece453a3..477c526d638b 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -125,6 +125,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 	struct jffs2_eraseblock *jeb;
 	struct jffs2_raw_node_ref *raw;
 	int ret = 0, inum, nlink;
+	int xattr = 0;
 
 	if (down_interruptible(&c->alloc_sem))
 		return -EINTR;
@@ -138,7 +139,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 		   the node CRCs etc. Do it now. */
 
 		/* checked_ino is protected by the alloc_sem */
-		if (c->checked_ino > c->highest_ino) {
+		if (c->checked_ino > c->highest_ino && xattr) {
 			printk(KERN_CRIT "Checked all inodes but still 0x%x bytes of unchecked space?\n",
 			       c->unchecked_size);
 			jffs2_dbg_dump_block_lists_nolock(c);
@@ -148,6 +149,9 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 		spin_unlock(&c->erase_completion_lock);
 
+		if (!xattr)
+			xattr = jffs2_verify_xattr(c);
+
 		spin_lock(&c->inocache_lock);
 
 		ic = jffs2_get_ino_cache(c, c->checked_ino++);
@@ -181,6 +185,10 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 			   and trigger the BUG() above while we haven't yet
 			   finished checking all its nodes */
 			D1(printk(KERN_DEBUG "Waiting for ino #%u to finish reading\n", ic->ino));
+			/* We need to come back again for the _same_ inode. We've
+			 made no progress in this case, but that should be OK */
+			c->checked_ino--;
+
 			up(&c->alloc_sem);
 			sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
 			return 0;
@@ -231,7 +239,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 	while(ref_obsolete(raw)) {
 		D1(printk(KERN_DEBUG "Node at 0x%08x is obsolete... skipping\n", ref_offset(raw)));
-		raw = raw->next_phys;
+		raw = ref_next(raw);
 		if (unlikely(!raw)) {
 			printk(KERN_WARNING "eep. End of raw list while still supposedly nodes to GC\n");
 			printk(KERN_WARNING "erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n",
@@ -248,16 +256,37 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 	if (!raw->next_in_ino) {
 		/* Inode-less node. Clean marker, snapshot or something like that */
-		/* FIXME: If it's something that needs to be copied, including something
-		   we don't grok that has JFFS2_NODETYPE_RWCOMPAT_COPY, we should do so */
 		spin_unlock(&c->erase_completion_lock);
-		jffs2_mark_node_obsolete(c, raw);
+		if (ref_flags(raw) == REF_PRISTINE) {
+			/* It's an unknown node with JFFS2_FEATURE_RWCOMPAT_COPY */
+			jffs2_garbage_collect_pristine(c, NULL, raw);
+		} else {
+			/* Just mark it obsolete */
+			jffs2_mark_node_obsolete(c, raw);
+		}
 		up(&c->alloc_sem);
 		goto eraseit_lock;
 	}
 
 	ic = jffs2_raw_ref_to_ic(raw);
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+	/* When 'ic' refers to an xattr_datum/xattr_ref, this node is GCed as an xattr.
+	 * ic->class tells us whether this node belongs to an inode or an xattr. */
+	if (ic->class == RAWNODE_CLASS_XATTR_DATUM
+	    || ic->class == RAWNODE_CLASS_XATTR_REF) {
+		BUG_ON(raw->next_in_ino != (void *)ic);
+		spin_unlock(&c->erase_completion_lock);
+
+		if (ic->class == RAWNODE_CLASS_XATTR_DATUM) {
+			ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+		} else {
+			ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+		}
+		goto release_sem;
+	}
+#endif
+
 	/* We need to hold the inocache. Either the erase_completion_lock or
 	   the inocache_lock are sufficient; we trade down since the inocache_lock
 	   causes less contention. */
@@ -499,7 +528,6 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 					  struct jffs2_raw_node_ref *raw)
 {
 	union jffs2_node_union *node;
-	struct jffs2_raw_node_ref *nraw;
 	size_t retlen;
 	int ret;
 	uint32_t phys_ofs, alloclen;
@@ -508,15 +536,16 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 
 	D1(printk(KERN_DEBUG "Going to GC REF_PRISTINE node at 0x%08x\n", ref_offset(raw)));
 
-	rawlen = ref_totlen(c, c->gcblock, raw);
+	alloclen = rawlen = ref_totlen(c, c->gcblock, raw);
 
 	/* Ask for a small amount of space (or the totlen if smaller) because we
 	   don't want to force wastage of the end of a block if splitting would
 	   work. */
-	ret = jffs2_reserve_space_gc(c, min_t(uint32_t, sizeof(struct jffs2_raw_inode) +
-				JFFS2_MIN_DATA_LEN, rawlen), &phys_ofs, &alloclen, rawlen);
-				/* this is not the exact summary size of it,
-					it is only an upper estimation */
+	if (ic && alloclen > sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN)
+		alloclen = sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN;
+
+	ret = jffs2_reserve_space_gc(c, alloclen, &alloclen, rawlen);
+	/* 'rawlen' is not the exact summary size; it is only an upper estimation */
 
 	if (ret)
 		return ret;
@@ -580,22 +609,17 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 		}
 		break;
 	default:
-		printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
-		       ref_offset(raw), je16_to_cpu(node->u.nodetype));
-		goto bail;
-	}
-
-	nraw = jffs2_alloc_raw_node_ref();
-	if (!nraw) {
-		ret = -ENOMEM;
-		goto out_node;
+		/* If it's inode-less, we don't _know_ what it is. Just copy it intact */
+		if (ic) {
+			printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
+			       ref_offset(raw), je16_to_cpu(node->u.nodetype));
+			goto bail;
+		}
 	}
 
 	/* OK, all the CRCs are good; this node can just be copied as-is. */
  retry:
-	nraw->flash_offset = phys_ofs;
-	nraw->__totlen = rawlen;
-	nraw->next_phys = NULL;
+	phys_ofs = write_ofs(c);
 
 	ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node);
 
@@ -603,17 +627,11 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 		printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n",
                        rawlen, phys_ofs, ret, retlen);
 		if (retlen) {
-                        /* Doesn't belong to any inode */
-			nraw->next_in_ino = NULL;
-
-			nraw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, nraw);
-			jffs2_mark_node_obsolete(c, nraw);
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL);
 		} else {
-			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", nraw->flash_offset);
-                        jffs2_free_raw_node_ref(nraw);
+			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", phys_ofs);
 		}
-		if (!retried && (nraw = jffs2_alloc_raw_node_ref())) {
+		if (!retried) {
 			/* Try to reallocate space and retry */
 			uint32_t dummy;
 			struct jffs2_eraseblock *jeb = &c->blocks[phys_ofs / c->sector_size];
@@ -625,7 +643,7 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 			jffs2_dbg_acct_sanity_check(c,jeb);
 			jffs2_dbg_acct_paranoia_check(c, jeb);
 
-			ret = jffs2_reserve_space_gc(c, rawlen, &phys_ofs, &dummy, rawlen);
+			ret = jffs2_reserve_space_gc(c, rawlen, &dummy, rawlen);
 						/* this is not the exact summary size of it,
 							it is only an upper estimation */
 
@@ -638,25 +656,13 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 				goto retry;
 			}
 			D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-			jffs2_free_raw_node_ref(nraw);
 		}
 
-		jffs2_free_raw_node_ref(nraw);
 		if (!ret)
 			ret = -EIO;
 		goto out_node;
 	}
-	nraw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, nraw);
-
-	/* Link into per-inode list. This is safe because of the ic
-	   state being INO_STATE_GC. Note that if we're doing this
-	   for an inode which is in-core, the 'nraw' pointer is then
-	   going to be fetched from ic->nodes by our caller. */
-	spin_lock(&c->erase_completion_lock);
-        nraw->next_in_ino = ic->nodes;
-        ic->nodes = nraw;
-	spin_unlock(&c->erase_completion_lock);
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic);
 
 	jffs2_mark_node_obsolete(c, raw);
 	D1(printk(KERN_DEBUG "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw)));
@@ -675,19 +681,16 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	struct jffs2_full_dnode *new_fn;
 	struct jffs2_raw_inode ri;
 	struct jffs2_node_frag *last_frag;
-	jint16_t dev;
+	union jffs2_device_node dev;
 	char *mdata = NULL, mdatalen = 0;
-	uint32_t alloclen, phys_ofs, ilen;
+	uint32_t alloclen, ilen;
 	int ret;
 
 	if (S_ISBLK(JFFS2_F_I_MODE(f)) ||
 	    S_ISCHR(JFFS2_F_I_MODE(f)) ) {
 		/* For these, we don't actually need to read the old node */
-		/* FIXME: for minor or major > 255. */
-		dev = cpu_to_je16(((JFFS2_F_I_RDEV_MAJ(f) << 8) |
-			JFFS2_F_I_RDEV_MIN(f)));
+		mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f));
 		mdata = (char *)&dev;
-		mdatalen = sizeof(dev);
 		D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of kdev_t\n", mdatalen));
 	} else if (S_ISLNK(JFFS2_F_I_MODE(f))) {
 		mdatalen = fn->size;
@@ -706,7 +709,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 
 	}
 
-	ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen,
 				JFFS2_SUMMARY_INODE_SIZE);
 	if (ret) {
 		printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n",
@@ -744,7 +747,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
 	ri.data_crc = cpu_to_je32(crc32(0, mdata, mdatalen));
 
-	new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, phys_ofs, ALLOC_GC);
+	new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC);
 
 	if (IS_ERR(new_fn)) {
 		printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn));
@@ -765,7 +768,7 @@ static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_er
 {
 	struct jffs2_full_dirent *new_fd;
 	struct jffs2_raw_dirent rd;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	rd.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -787,14 +790,14 @@ static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_er
 	rd.node_crc = cpu_to_je32(crc32(0, &rd, sizeof(rd)-8));
 	rd.name_crc = cpu_to_je32(crc32(0, fd->name, rd.nsize));
 
-	ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen,
 				JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize));
 	if (ret) {
 		printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n",
 		       sizeof(rd)+rd.nsize, ret);
 		return ret;
 	}
-	new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, phys_ofs, ALLOC_GC);
+	new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC);
 
 	if (IS_ERR(new_fd)) {
 		printk(KERN_WARNING "jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", PTR_ERR(new_fd));
@@ -922,7 +925,7 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
 	struct jffs2_raw_inode ri;
 	struct jffs2_node_frag *frag;
 	struct jffs2_full_dnode *new_fn;
-	uint32_t alloclen, phys_ofs, ilen;
+	uint32_t alloclen, ilen;
 	int ret;
 
 	D1(printk(KERN_DEBUG "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n",
@@ -1001,14 +1004,14 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
 	ri.data_crc = cpu_to_je32(0);
 	ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
 
-	ret = jffs2_reserve_space_gc(c, sizeof(ri), &phys_ofs, &alloclen,
-				JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen,
+				     JFFS2_SUMMARY_INODE_SIZE);
 	if (ret) {
 		printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n",
 		       sizeof(ri), ret);
 		return ret;
 	}
-	new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, phys_ofs, ALLOC_GC);
+	new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC);
 
 	if (IS_ERR(new_fn)) {
 		printk(KERN_WARNING "Error writing new hole node: %ld\n", PTR_ERR(new_fn));
@@ -1070,7 +1073,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 {
 	struct jffs2_full_dnode *new_fn;
 	struct jffs2_raw_inode ri;
-	uint32_t alloclen, phys_ofs, offset, orig_end, orig_start;
+	uint32_t alloclen, offset, orig_end, orig_start;
 	int ret = 0;
 	unsigned char *comprbuf = NULL, *writebuf;
 	unsigned long pg;
@@ -1227,7 +1230,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 		uint32_t cdatalen;
 		uint16_t comprtype = JFFS2_COMPR_NONE;
 
-		ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN, &phys_ofs,
+		ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN,
 					&alloclen, JFFS2_SUMMARY_INODE_SIZE);
 
 		if (ret) {
@@ -1264,7 +1267,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 		ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
 		ri.data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen));
 
-		new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, phys_ofs, ALLOC_GC);
+		new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, ALLOC_GC);
 
 		jffs2_free_comprbuf(comprbuf, writebuf);
 
diff --git a/fs/jffs2/histo.h b/fs/jffs2/histo.h
deleted file mode 100644
index 22a93a08210c..000000000000
--- a/fs/jffs2/histo.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* This file provides the bit-probabilities for the input file */
-#define BIT_DIVIDER 629
-static int bits[9] = { 179,167,183,165,159,198,178,119,}; /* ia32 .so files */
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
new file mode 100644
index 000000000000..2e0cc8e00b85
--- /dev/null
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -0,0 +1,55 @@
+/* $Id: jffs2_fs_i.h,v 1.19 2005/11/07 11:14:52 gleixner Exp $ */
+
+#ifndef _JFFS2_FS_I
+#define _JFFS2_FS_I
+
+#include <linux/version.h>
+#include <linux/rbtree.h>
+#include <linux/posix_acl.h>
+#include <asm/semaphore.h>
+
+struct jffs2_inode_info {
+	/* We need an internal mutex similar to inode->i_mutex.
+	   Unfortunately, we can't use the existing one, because
+	   either the GC would deadlock, or we'd have to release it
+	   before letting GC proceed. Or we'd have to put ugliness
+	   into the GC code so it didn't attempt to obtain the i_mutex
+	   for the inode(s) which are already locked */
+	struct semaphore sem;
+
+	/* The highest (datanode) version number used for this ino */
+	uint32_t highest_version;
+
+	/* List of data fragments which make up the file */
+	struct rb_root fragtree;
+
+	/* There may be one datanode which isn't referenced by any of the
+	   above fragments, if it contains a metadata update but no actual
+	   data - or if this is a directory inode */
+	/* This also holds the _only_ dnode for symlinks/device nodes,
+	   etc. */
+	struct jffs2_full_dnode *metadata;
+
+	/* Directory entries */
+	struct jffs2_full_dirent *dents;
+
+	/* The target path if this is the inode of a symlink */
+	unsigned char *target;
+
+	/* Some stuff we just have to keep in-core at all times, for each inode. */
+	struct jffs2_inode_cache *inocache;
+
+	uint16_t flags;		/* NOTE(review): bit meanings are not defined in this header */
+	uint8_t usercompr;	/* presumably a user-selected compression mode -- confirm */
+#if !defined (__ECOS)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,2)
+	struct inode vfs_inode;	/* embedded VFS inode; omitted for eCos and kernels <= 2.5.2 */
+#endif
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	struct posix_acl *i_acl_access;	/* ACL pointers exist only with POSIX ACL support; */
+	struct posix_acl *i_acl_default;	/* presumably the access and default ACLs -- confirm */
+#endif
+};
+
+#endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
new file mode 100644
index 000000000000..935fec1b1201
--- /dev/null
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -0,0 +1,133 @@
+/* $Id: jffs2_fs_sb.h,v 1.54 2005/09/21 13:37:34 dedekind Exp $ */
+
+#ifndef _JFFS2_FS_SB
+#define _JFFS2_FS_SB
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <asm/semaphore.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+
+#define JFFS2_SB_FLAG_RO 1
+#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */
+#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */
+
+struct jffs2_inodirty;
+
+/* A struct for the overall file system control.  Pointers to
+   jffs2_sb_info structs are named `c' in the source code.
+   Nee jffs_control
+*/
+struct jffs2_sb_info {
+	struct mtd_info *mtd;
+
+	uint32_t highest_ino;
+	uint32_t checked_ino;
+
+	unsigned int flags;	/* presumably JFFS2_SB_FLAG_* bits -- confirm */
+
+	struct task_struct *gc_task;	/* GC task struct */
+	struct completion gc_thread_start; /* GC thread start completion */
+	struct completion gc_thread_exit; /* GC thread exit completion port */
+
+	struct semaphore alloc_sem;	/* Used to protect all the following
+					   fields, and also to protect against
+					   out-of-order writing of nodes. And GC. */
+	uint32_t cleanmarker_size;	/* Size of an _inline_ CLEANMARKER
+					 (i.e. zero for OOB CLEANMARKER) */
+
+	uint32_t flash_size;
+	uint32_t used_size;
+	uint32_t dirty_size;
+	uint32_t wasted_size;
+	uint32_t free_size;
+	uint32_t erasing_size;
+	uint32_t bad_size;
+	uint32_t sector_size;
+	uint32_t unchecked_size;
+
+	uint32_t nr_free_blocks;
+	uint32_t nr_erasing_blocks;
+
+	/* Number of free blocks there must be before we... */
+	uint8_t resv_blocks_write;	/* ... allow a normal filesystem write */
+	uint8_t resv_blocks_deletion;	/* ... allow a normal filesystem deletion */
+	uint8_t resv_blocks_gctrigger;	/* ... wake up the GC thread */
+	uint8_t resv_blocks_gcbad;	/* ... pick a block from the bad_list to GC */
+	uint8_t resv_blocks_gcmerge;	/* ... merge pages when garbage collecting */
+
+	uint32_t nospc_dirty_size;
+
+	uint32_t nr_blocks;	/* number of eraseblocks in blocks[] */
+	struct jffs2_eraseblock *blocks;	/* The whole array of blocks. Used for getting blocks
+						 * from the offset (blocks[ofs / sector_size]) */
+	struct jffs2_eraseblock *nextblock;	/* The block we're currently filling */
+
+	struct jffs2_eraseblock *gcblock;	/* The block we're currently garbage-collecting */
+
+	struct list_head clean_list;		/* Blocks 100% full of clean data */
+	struct list_head very_dirty_list;	/* Blocks with lots of dirty space */
+	struct list_head dirty_list;		/* Blocks with some dirty space */
+	struct list_head erasable_list;		/* Blocks which are completely dirty, and need erasing */
+	struct list_head erasable_pending_wbuf_list;	/* Blocks which need erasing but only after the current wbuf is flushed */
+	struct list_head erasing_list;		/* Blocks which are currently erasing */
+	struct list_head erase_pending_list;	/* Blocks which need erasing now */
+	struct list_head erase_complete_list;	/* Blocks which are erased and need the clean marker written to them */
+	struct list_head free_list;		/* Blocks which are free and ready to be used */
+	struct list_head bad_list;		/* Bad blocks. */
+	struct list_head bad_used_list;		/* Bad blocks with valid data in. */
+
+	spinlock_t erase_completion_lock;	/* Protect free_list and erasing_list
+						   against erase completion handler */
+	wait_queue_head_t erase_wait;		/* For waiting for erases to complete */
+
+	wait_queue_head_t inocache_wq;
+	struct jffs2_inode_cache **inocache_list;	/* array of inode cache chains, linked via ->next */
+	spinlock_t inocache_lock;
+
+	/* Sem to allow jffs2_garbage_collect_deletion_dirent to
+	   drop the erase_completion_lock while it's holding a pointer
+	   to an obsoleted node. I don't like this. Alternatives welcomed. */
+	struct semaphore erase_free_sem;
+
+	uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */
+
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	/* Write-behind buffer for NAND flash */
+	unsigned char *wbuf;
+	unsigned char *oobbuf;
+	uint32_t wbuf_ofs;
+	uint32_t wbuf_len;
+	struct jffs2_inodirty *wbuf_inodes;
+
+	struct rw_semaphore wbuf_sem;	/* Protects the write buffer */
+
+	/* Information about out-of-band area usage... */
+	struct nand_ecclayout *ecclayout;
+	uint32_t badblock_pos;
+	uint32_t fsdata_pos;
+	uint32_t fsdata_len;
+#endif
+
+	struct jffs2_summary *summary;		/* Summary information */
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+#define XATTRINDEX_HASHSIZE	(57)
+	uint32_t highest_xid;
+	struct list_head xattrindex[XATTRINDEX_HASHSIZE];	/* presumably hash buckets of xattr datums -- confirm */
+	struct list_head xattr_unchecked;
+	struct jffs2_xattr_ref *xref_temp;
+	struct rw_semaphore xattr_sem;
+	uint32_t xdatum_mem_usage;
+	uint32_t xdatum_mem_threshold;
+#endif
+	/* OS-private pointer for getting back to master superblock info */
+	void *os_priv;
+};
+
+#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 036cbd11c004..4889d0700c0e 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -26,6 +26,10 @@ static kmem_cache_t *tmp_dnode_info_slab;
 static kmem_cache_t *raw_node_ref_slab;
 static kmem_cache_t *node_frag_slab;
 static kmem_cache_t *inode_cache_slab;
+#ifdef CONFIG_JFFS2_FS_XATTR
+static kmem_cache_t *xattr_datum_cache;
+static kmem_cache_t *xattr_ref_cache;
+#endif
 
 int __init jffs2_create_slab_caches(void)
 {
@@ -53,8 +57,8 @@ int __init jffs2_create_slab_caches(void)
 	if (!tmp_dnode_info_slab)
 		goto err;
 
-	raw_node_ref_slab = kmem_cache_create("jffs2_raw_node_ref",
-					      sizeof(struct jffs2_raw_node_ref),
+	raw_node_ref_slab = kmem_cache_create("jffs2_refblock",
+					      sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1),
 					      0, 0, NULL, NULL);
 	if (!raw_node_ref_slab)
 		goto err;
@@ -68,8 +72,24 @@ int __init jffs2_create_slab_caches(void)
 	inode_cache_slab = kmem_cache_create("jffs2_inode_cache",
 					     sizeof(struct jffs2_inode_cache),
 					     0, 0, NULL, NULL);
-	if (inode_cache_slab)
-		return 0;
+	if (!inode_cache_slab)
+		goto err;
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+	xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum",
+					     sizeof(struct jffs2_xattr_datum),
+					     0, 0, NULL, NULL);
+	if (!xattr_datum_cache)
+		goto err;
+
+	xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref",
+					   sizeof(struct jffs2_xattr_ref),
+					   0, 0, NULL, NULL);
+	if (!xattr_ref_cache)
+		goto err;
+#endif
+
+	return 0;
  err:
 	jffs2_destroy_slab_caches();
 	return -ENOMEM;
@@ -91,6 +111,12 @@ void jffs2_destroy_slab_caches(void)
 		kmem_cache_destroy(node_frag_slab);
 	if(inode_cache_slab)
 		kmem_cache_destroy(inode_cache_slab);
+#ifdef CONFIG_JFFS2_FS_XATTR
+	if (xattr_datum_cache)
+		kmem_cache_destroy(xattr_datum_cache);
+	if (xattr_ref_cache)
+		kmem_cache_destroy(xattr_ref_cache);
+#endif
 }
 
 struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize)
@@ -164,15 +190,65 @@ void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x)
 	kmem_cache_free(tmp_dnode_info_slab, x);
 }
 
-struct jffs2_raw_node_ref *jffs2_alloc_raw_node_ref(void)
+struct jffs2_raw_node_ref *jffs2_alloc_refblock(void)
 {
 	struct jffs2_raw_node_ref *ret;
+
 	ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL);
-	dbg_memalloc("%p\n", ret);
+	if (ret) {	/* on allocation failure NULL is handed straight back to the caller */
+		int i = 0;
+		for (i=0; i < REFS_PER_BLOCK; i++) {	/* every ordinary slot starts out empty */
+			ret[i].flash_offset = REF_EMPTY_NODE;
+			ret[i].next_in_ino = NULL;
+		}
+		ret[i].flash_offset = REF_LINK_NODE;	/* extra final slot: link to a following refblock */
+		ret[i].next_in_ino = NULL;		/* ...nothing chained on yet */
+	}
 	return ret;
 }
 
-void jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *x)
+/* Walk on from jeb->last_node until 'nr' ref slots have been counted, chaining in
+ * new refblocks wherever the chain ends. Returns 0, or -ENOMEM if one can't be had. */
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb, int nr)
+{
+	struct jffs2_raw_node_ref **link = &jeb->last_node;
+	struct jffs2_raw_node_ref *slot = *link;
+	int needed;
+
+	dbg_memalloc("%d\n", nr);
+	dbg_memalloc("Reserving %d refs for block @0x%08x\n", nr, jeb->offset);
+
+	/* If jeb->last_node is really a valid node then skip over it */
+	if (slot && slot->flash_offset != REF_EMPTY_NODE)
+		slot++;
+
+	for (needed = nr; needed; ) {
+		if (!slot) {
+			/* Ran off the end of the chain: hang a fresh refblock there */
+			dbg_memalloc("Allocating new refblock linked from %p\n", link);
+			slot = *link = jffs2_alloc_refblock();
+			if (!slot)
+				return -ENOMEM;
+		}
+		if (slot->flash_offset != REF_LINK_NODE) {
+			needed--;
+			slot++;
+		} else {
+			/* Link slot: continue in whatever refblock it points to */
+			link = &slot->next_in_ino;
+			slot = *link;
+		}
+	}
+	jeb->allocated_refs = nr;
+
+	dbg_memalloc("Reserved %d refs for block @0x%08x, last_node is %p (%08x,%p)\n",
+		  nr, jeb->offset, jeb->last_node, jeb->last_node->flash_offset,
+		  jeb->last_node->next_in_ino);
+	return 0;
+}
+
+void jffs2_free_refblock(struct jffs2_raw_node_ref *x)
 {
 	dbg_memalloc("%p\n", x);
 	kmem_cache_free(raw_node_ref_slab, x);
@@ -205,3 +281,40 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 	dbg_memalloc("%p\n", x);
 	kmem_cache_free(inode_cache_slab, x);
 }
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+/* Allocate a zeroed xattr datum tagged with its class; returns NULL when out of memory */
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
+{
+	struct jffs2_xattr_datum *xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
+	dbg_memalloc("%p\n", xd);
+	if (!xd)	/* allocation failed: report it instead of writing through NULL */
+		return NULL;
+	xd->class = RAWNODE_CLASS_XATTR_DATUM;
+	INIT_LIST_HEAD(&xd->xindex);
+	return xd;
+}
+
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
+{
+	dbg_memalloc("%p\n", xd);
+	kmem_cache_free(xattr_datum_cache, xd);	/* hand xd back to the xattr_datum slab cache */
+}
+
+/* Allocate a zeroed xattr ref tagged with its class; returns NULL when out of memory */
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
+{
+	struct jffs2_xattr_ref *ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
+	dbg_memalloc("%p\n", ref);
+	if (!ref)	/* allocation failed: report it instead of writing through NULL */
+		return NULL;
+	ref->class = RAWNODE_CLASS_XATTR_REF;
+	return ref;
+}
+
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *ref)
+{
+	dbg_memalloc("%p\n", ref);
+	kmem_cache_free(xattr_ref_cache, ref);	/* hand ref back to the xattr_ref slab cache */
+}
+#endif
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 1d46677afd17..927dfe42ba76 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -438,8 +438,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 	if (c->mtd->point) {
 		err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
 		if (!err && retlen < tn->csize) {
-			JFFS2_WARNING("MTD point returned len too short: %zu "
-					"instead of %u.\n", retlen, tn->csize);
+			JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
 			c->mtd->unpoint(c->mtd, buffer, ofs, len);
 		} else if (err)
 			JFFS2_WARNING("MTD point failed: error code %d.\n", err);
@@ -462,8 +461,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 		}
 
 		if (retlen != len) {
-			JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n",
-					ofs, retlen, len);
+			JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
 			err = -EIO;
 			goto free_out;
 		}
@@ -940,6 +938,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
 		this = c->inocache_list[i];
 		while (this) {
 			next = this->next;
+			jffs2_xattr_free_inode(c, this);
 			jffs2_free_inode_cache(this);
 			this = next;
 		}
@@ -954,9 +953,13 @@ void jffs2_free_raw_node_refs(struct jffs2_sb_info *c)
 
 	for (i=0; i<c->nr_blocks; i++) {
 		this = c->blocks[i].first_node;
-		while(this) {
-			next = this->next_phys;
-			jffs2_free_raw_node_ref(this);
+		while (this) {
+			if (this[REFS_PER_BLOCK].flash_offset == REF_LINK_NODE)
+				next = this[REFS_PER_BLOCK].next_in_ino;
+			else
+				next = NULL;
+
+			jffs2_free_refblock(this);
 			this = next;
 		}
 		c->blocks[i].first_node = c->blocks[i].last_node = NULL;
@@ -1047,3 +1050,169 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
 		cond_resched();
 	}
 }
+
+struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
+					       struct jffs2_eraseblock *jeb,
+					       uint32_t ofs, uint32_t len,
+					       struct jffs2_inode_cache *ic)
+{
+	struct jffs2_raw_node_ref *ref;
+	/* Record the node at 'ofs' (REF_* state in its low bits, 'len' bytes) in jeb's next free ref slot */
+	BUG_ON(!jeb->allocated_refs);	/* a slot must have been reserved beforehand */
+	jeb->allocated_refs--;
+
+	ref = jeb->last_node;
+
+	dbg_noderef("Last node at %p is (%08x,%p)\n", ref, ref->flash_offset,
+		    ref->next_in_ino);
+
+	while (ref->flash_offset != REF_EMPTY_NODE) {	/* advance to the first unused slot */
+		if (ref->flash_offset == REF_LINK_NODE)
+			ref = ref->next_in_ino;	/* link slot: continue in the next refblock */
+		else
+			ref++;
+	}
+
+	dbg_noderef("New ref is %p (%08x becomes %08x,%p) len 0x%x\n", ref, 
+		    ref->flash_offset, ofs, ref->next_in_ino, len);
+
+	ref->flash_offset = ofs;
+
+	if (!jeb->first_node) {	/* the first ref of a block must sit at the block start */
+		jeb->first_node = ref;
+		BUG_ON(ref_offset(ref) != jeb->offset);
+	} else if (unlikely(ref_offset(ref) != jeb->offset + c->sector_size - jeb->free_size)) {
+		uint32_t last_len = ref_totlen(c, jeb, jeb->last_node);
+
+		JFFS2_ERROR("Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n",
+			    ref, ref_offset(ref), ref_offset(ref)+len,
+			    ref_offset(jeb->last_node), 
+			    ref_offset(jeb->last_node)+last_len);
+		BUG();
+	}
+	jeb->last_node = ref;
+
+	if (ic) {	/* push onto the owner's node list when an owner is given */
+		ref->next_in_ino = ic->nodes;
+		ic->nodes = ref;
+	} else {
+		ref->next_in_ino = NULL;
+	}
+
+	switch(ref_flags(ref)) {	/* the state bits carried in 'ofs' pick the counters to charge */
+	case REF_UNCHECKED:
+		c->unchecked_size += len;
+		jeb->unchecked_size += len;
+		break;
+
+	case REF_NORMAL:
+	case REF_PRISTINE:
+		c->used_size += len;
+		jeb->used_size += len;
+		break;
+
+	case REF_OBSOLETE:
+		c->dirty_size += len;
+		jeb->dirty_size += len;
+		break;
+	}
+	c->free_size -= len;	/* whatever the state, the space is no longer free */
+	jeb->free_size -= len;
+
+#ifdef TEST_TOTLEN
+	/* Set (and test) __totlen field... for now */
+	ref->__totlen = len;
+	ref_totlen(c, jeb, ref);
+#endif
+	return ref;
+}
+
+/* Account 'size' bytes at the current end of 'jeb' as dirty space.
+ * No locking, no reservation of 'ref'. Do not use on a live file system */
+int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			   uint32_t size)
+{
+	struct jffs2_raw_node_ref *tail = jeb->last_node;
+
+	if (!size)
+		return 0;
+	if (unlikely(size > jeb->free_size)) {
+		printk(KERN_CRIT "Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n",
+		       size, jeb->free_size, jeb->wasted_size);
+		BUG();
+	}
+	/* No obsolete tail ref to grow (REF_EMPTY_NODE is !obsolete): link a new one */
+	if (!tail || !ref_obsolete(tail)) {
+		uint32_t start = jeb->offset + c->sector_size - jeb->free_size;
+		jffs2_link_node_ref(c, jeb, start | REF_OBSOLETE, size, NULL);
+		return 0;
+	}
+#ifdef TEST_TOTLEN
+	tail->__totlen += size;
+#endif
+	jeb->dirty_size += size;
+	jeb->free_size -= size;
+	c->dirty_size += size;
+	c->free_size -= size;
+	return 0;
+}
+
+/* Calculate totlen from surrounding nodes or eraseblock: the node at 'ref' runs
+ * up to the next ref or, for the block's last ref, up to its free space.
+ * 'jeb' may be NULL, in which case it is looked up from ref->flash_offset. */
+static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
+				    struct jffs2_eraseblock *jeb,
+				    struct jffs2_raw_node_ref *ref)
+{
+	struct jffs2_raw_node_ref *following = ref_next(ref);
+
+	if (following)
+		return ref_offset(following) - ref_offset(ref);
+
+	/* Last node in block. Use free_space */
+	if (!jeb)
+		jeb = &c->blocks[ref->flash_offset / c->sector_size];
+
+	if (unlikely(ref != jeb->last_node)) {
+		printk(KERN_CRIT "ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n",
+		       ref, ref_offset(ref), jeb->last_node, jeb->last_node?ref_offset(jeb->last_node):0);
+		BUG();
+	}
+
+	return jeb->offset + c->sector_size - jeb->free_size - ref_offset(ref);
+}
+
+/* Length of the node at 'ref' as computed from its neighbours ('jeb' may be NULL).
+ * With TEST_TOTLEN the stored ref->__totlen wins on a mismatch, after a loud report. */
+uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			    struct jffs2_raw_node_ref *ref)
+{
+	uint32_t ret = __ref_totlen(c, jeb, ref);
+
+#ifdef TEST_TOTLEN
+	if (unlikely(ret != ref->__totlen)) {
+		struct jffs2_raw_node_ref *next = ref_next(ref);
+
+		if (!jeb)
+			jeb = &c->blocks[ref->flash_offset / c->sector_size];
+
+		printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
+		       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
+		       ret, ref->__totlen);
+		/* Show the neighbour's own extent: its end is its offset plus its own __totlen */
+		if (next)
+			printk(KERN_CRIT "next %p (0x%08x-0x%08x)\n", next, ref_offset(next),
+			       ref_offset(next)+next->__totlen);
+		else
+			printk(KERN_CRIT "No next ref. jeb->last_node is %p\n", jeb->last_node);
+		printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size);
+
+#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS)
+		__jffs2_dbg_dump_node_refs_nolock(c, jeb);
+#endif
+		WARN_ON(1);
+		ret = ref->__totlen;	/* fall back to the recorded length */
+	}
+#endif /* TEST_TOTLEN */
+	return ret;
+}
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 23a67bb3052f..b16c60bbcf6e 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -18,8 +18,10 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_sb.h>
-#include <linux/jffs2_fs_i.h>
+#include "jffs2_fs_sb.h"
+#include "jffs2_fs_i.h"
+#include "xattr.h"
+#include "acl.h"
 #include "summary.h"
 
 #ifdef __ECOS
@@ -75,14 +77,50 @@
 struct jffs2_raw_node_ref
 {
 	struct jffs2_raw_node_ref *next_in_ino; /* Points to the next raw_node_ref
-		for this inode. If this is the last, it points to the inode_cache
-		for this inode instead. The inode_cache will have NULL in the first
-		word so you know when you've got there :) */
-	struct jffs2_raw_node_ref *next_phys;
+		for this object. If this _is_ the last, it points to the inode_cache,
+		xattr_ref or xattr_datum instead. The common part of those structures
+		has NULL in the first word. See jffs2_raw_ref_to_ic() below */
 	uint32_t flash_offset;
+#define TEST_TOTLEN	/* keeps __totlen so __jffs2_ref_totlen() can cross-check computed lengths */
+#ifdef TEST_TOTLEN
 	uint32_t __totlen; /* This may die; use ref_totlen(c, jeb, ) below */
+#endif /* TEST_TOTLEN */
 };
 
+#define REF_LINK_NODE ((int32_t)-1)	/* flash_offset marker: next_in_ino chains to the next refblock */
+#define REF_EMPTY_NODE ((int32_t)-2)	/* flash_offset marker: slot is not in use yet */
+
+/* Use blocks of about 256 bytes: REFS_PER_BLOCK ordinary slots plus one link slot */
+#define REFS_PER_BLOCK ((255/sizeof(struct jffs2_raw_node_ref))-1)
+
+/* Return the ref after 'ref' in its block's chain, or NULL at the end.
+ * Stepping onto a link slot continues in the refblock it points to;
+ * the slot after 'ref' must exist (refblocks always end in a link slot). */
+static inline struct jffs2_raw_node_ref *ref_next(struct jffs2_raw_node_ref *ref)
+{
+	struct jffs2_raw_node_ref *succ = ref + 1;
+
+	/* Link to another block of refs */
+	if (succ->flash_offset == REF_LINK_NODE)
+		succ = succ->next_in_ino;
+
+	/* End of chain: no further refblock, or an unused slot */
+	if (!succ || succ->flash_offset == REF_EMPTY_NODE)
+		return NULL;
+
+	return succ;
+}
+
+/* Walk next_in_ino to the owning object, whose first word is NULL. NB: that may be
+   a jffs2_xattr_datum or jffs2_xattr_ref, not a jffs2_inode_cache. Check ->class */
+static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
+{
+	struct jffs2_raw_node_ref *last = raw;
+	while (last->next_in_ino != NULL)
+		last = last->next_in_ino;
+	return (struct jffs2_inode_cache *)last;
+}
+
         /* flash_offset & 3 always has to be zero, because nodes are
 	   always aligned at 4 bytes. So we have a couple of extra bits
 	   to play with, which indicate the node's status; see below: */
@@ -95,6 +133,11 @@ struct jffs2_raw_node_ref
 #define ref_obsolete(ref)	(((ref)->flash_offset & 3) == REF_OBSOLETE)
 #define mark_ref_normal(ref)    do { (ref)->flash_offset = ref_offset(ref) | REF_NORMAL; } while(0)
 
+/* NB: REF_PRISTINE for an inode-less node (ref->next_in_ino == NULL) indicates
+   it is an unknown node of type JFFS2_NODETYPE_RWCOMPAT_COPY, so it'll get
+   copied. If you need to do anything different to GC inode-less nodes, then
+   you need to modify gc.c accordingly. */
+
 /* For each inode in the filesystem, we need to keep a record of
    nlink, because it would be a PITA to scan the whole directory tree
    at read_inode() time to calculate it, and to keep sufficient information
@@ -103,15 +146,27 @@ struct jffs2_raw_node_ref
    a pointer to the first physical node which is part of this inode, too.
 */
 struct jffs2_inode_cache {
+	/* First part of structure is shared with other objects which
+	   can terminate the raw node refs' next_in_ino list -- which are
+	   currently struct jffs2_xattr_datum and struct jffs2_xattr_ref. */
+
 	struct jffs2_full_dirent *scan_dents; /* Used during scan to hold
 		temporary lists of dirents, and later must be set to
 		NULL to mark the end of the raw_node_ref->next_in_ino
 		chain. */
-	struct jffs2_inode_cache *next;
 	struct jffs2_raw_node_ref *nodes;
+	uint8_t class;	/* RAWNODE_CLASS_*: tells inode_cache, xattr_datum and xattr_ref apart */
+
+	/* end of shared structure */
+
+	uint8_t flags;	/* presumably INO_FLAGS_* bits -- confirm */
+	uint16_t state;	/* one of the INO_STATE_* values */
 	uint32_t ino;
+	struct jffs2_inode_cache *next;	/* next entry on the same c->inocache_list[] chain */
+#ifdef CONFIG_JFFS2_FS_XATTR
+	struct jffs2_xattr_ref *xref;	/* presumably this inode's xattr refs -- confirm */
+#endif
 	int nlink;
-	int state;
 };
 
 /* Inode states for 'state' above. We need the 'GC' state to prevent
@@ -125,8 +180,16 @@ struct jffs2_inode_cache {
 #define INO_STATE_READING	5	/* In read_inode() */
 #define INO_STATE_CLEARING	6	/* In clear_inode() */
 
+#define INO_FLAGS_XATTR_CHECKED	0x01	/* has no duplicate xattr_ref */
+
+#define RAWNODE_CLASS_INODE_CACHE	0
+#define RAWNODE_CLASS_XATTR_DATUM	1
+#define RAWNODE_CLASS_XATTR_REF		2
+
 #define INOCACHE_HASHSIZE 128
 
+#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)	/* flash offset of the first free byte in c->nextblock */
+
 /*
   Larger representation of a raw node, kept in-core only when the
   struct inode for this particular ino is instantiated.
@@ -192,6 +255,7 @@ struct jffs2_eraseblock
 	uint32_t wasted_size;
 	uint32_t free_size;	/* Note that sector_size - free_size
 				   is the address of the first free space */
+	uint32_t allocated_refs;
 	struct jffs2_raw_node_ref *first_node;
 	struct jffs2_raw_node_ref *last_node;
 
@@ -203,57 +267,7 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
 	return ((c->flash_size / c->sector_size) * sizeof (struct jffs2_eraseblock)) > (128 * 1024);
 }
 
-/* Calculate totlen from surrounding nodes or eraseblock */
-static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
-				    struct jffs2_eraseblock *jeb,
-				    struct jffs2_raw_node_ref *ref)
-{
-	uint32_t ref_end;
-
-	if (ref->next_phys)
-		ref_end = ref_offset(ref->next_phys);
-	else {
-		if (!jeb)
-			jeb = &c->blocks[ref->flash_offset / c->sector_size];
-
-		/* Last node in block. Use free_space */
-		BUG_ON(ref != jeb->last_node);
-		ref_end = jeb->offset + c->sector_size - jeb->free_size;
-	}
-	return ref_end - ref_offset(ref);
-}
-
-static inline uint32_t ref_totlen(struct jffs2_sb_info *c,
-				  struct jffs2_eraseblock *jeb,
-				  struct jffs2_raw_node_ref *ref)
-{
-	uint32_t ret;
-
-#if CONFIG_JFFS2_FS_DEBUG > 0
-	if (jeb && jeb != &c->blocks[ref->flash_offset / c->sector_size]) {
-		printk(KERN_CRIT "ref_totlen called with wrong block -- at 0x%08x instead of 0x%08x; ref 0x%08x\n",
-		       jeb->offset, c->blocks[ref->flash_offset / c->sector_size].offset, ref_offset(ref));
-		BUG();
-	}
-#endif
-
-#if 1
-	ret = ref->__totlen;
-#else
-	/* This doesn't actually work yet */
-	ret = __ref_totlen(c, jeb, ref);
-	if (ret != ref->__totlen) {
-		printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
-		       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
-		       ret, ref->__totlen);
-		if (!jeb)
-			jeb = &c->blocks[ref->flash_offset / c->sector_size];
-		jffs2_dbg_dump_node_refs_nolock(c, jeb);
-		BUG();
-	}
-#endif
-	return ret;
-}
+#define ref_totlen(a, b, c) __jffs2_ref_totlen((a), (b), (c))
 
 #define ALLOC_NORMAL	0	/* Normal allocation */
 #define ALLOC_DELETION	1	/* Deletion node. Best to allow it */
@@ -268,13 +282,15 @@ static inline uint32_t ref_totlen(struct jffs2_sb_info *c,
 
 #define PAD(x) (((x)+3)&~3)
 
-static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
+static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
 {
-	while(raw->next_in_ino) {
-		raw = raw->next_in_ino;
+	if (old_valid_dev(rdev)) {
+		jdev->old = cpu_to_je16(old_encode_dev(rdev));
+		return sizeof(jdev->old);
+	} else {
+		jdev->new = cpu_to_je32(new_encode_dev(rdev));
+		return sizeof(jdev->new);
 	}
-
-	return ((struct jffs2_inode_cache *)raw);
 }
 
 static inline struct jffs2_node_frag *frag_first(struct rb_root *root)
@@ -299,7 +315,6 @@ static inline struct jffs2_node_frag *frag_last(struct rb_root *root)
 	return rb_entry(node, struct jffs2_node_frag, rb);
 }
 
-#define rb_parent(rb) ((rb)->rb_parent)
 #define frag_next(frag) rb_entry(rb_next(&(frag)->rb), struct jffs2_node_frag, rb)
 #define frag_prev(frag) rb_entry(rb_prev(&(frag)->rb), struct jffs2_node_frag, rb)
 #define frag_parent(frag) rb_entry(rb_parent(&(frag)->rb), struct jffs2_node_frag, rb)
@@ -324,28 +339,44 @@ void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, struct jffs2_node_frag *t
 int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
 void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
 int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
+struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
+					       struct jffs2_eraseblock *jeb,
+					       uint32_t ofs, uint32_t len,
+					       struct jffs2_inode_cache *ic);
+extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c,
+				   struct jffs2_eraseblock *jeb,
+				   struct jffs2_raw_node_ref *ref);
 
 /* nodemgmt.c */
 int jffs2_thread_should_wake(struct jffs2_sb_info *c);
-int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 			uint32_t *len, int prio, uint32_t sumsize);
-int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
 			uint32_t *len, uint32_t sumsize);
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new);
+struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, 
+						       uint32_t ofs, uint32_t len,
+						       struct jffs2_inode_cache *ic);
 void jffs2_complete_reservation(struct jffs2_sb_info *c);
 void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw);
 
 /* write.c */
 int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t mode, struct jffs2_raw_inode *ri);
 
-struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const unsigned char *data, uint32_t datalen, uint32_t flash_ofs, int alloc_mode);
-struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_dirent *rd, const unsigned char *name, uint32_t namelen, uint32_t flash_ofs, int alloc_mode);
+struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					   struct jffs2_raw_inode *ri, const unsigned char *data,
+					   uint32_t datalen, int alloc_mode);
+struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					     struct jffs2_raw_dirent *rd, const unsigned char *name,
+					     uint32_t namelen, int alloc_mode);
 int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			    struct jffs2_raw_inode *ri, unsigned char *buf,
 			    uint32_t offset, uint32_t writelen, uint32_t *retlen);
-int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen);
-int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
-int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, uint8_t type, const char *name, int namelen, uint32_t time);
+int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
+		    struct jffs2_raw_inode *ri, const char *name, int namelen);
+int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
+		    int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
+int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
+		   uint8_t type, const char *name, int namelen, uint32_t time);
 
 
 /* readinode.c */
@@ -368,12 +399,19 @@ struct jffs2_raw_inode *jffs2_alloc_raw_inode(void);
 void jffs2_free_raw_inode(struct jffs2_raw_inode *);
 struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void);
 void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *);
-struct jffs2_raw_node_ref *jffs2_alloc_raw_node_ref(void);
-void jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *);
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb, int nr);
+void jffs2_free_refblock(struct jffs2_raw_node_ref *);
 struct jffs2_node_frag *jffs2_alloc_node_frag(void);
 void jffs2_free_node_frag(struct jffs2_node_frag *);
 struct jffs2_inode_cache *jffs2_alloc_inode_cache(void);
 void jffs2_free_inode_cache(struct jffs2_inode_cache *);
+#ifdef CONFIG_JFFS2_FS_XATTR
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void);
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *);
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void);
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *);
+#endif
 
 /* gc.c */
 int jffs2_garbage_collect_pass(struct jffs2_sb_info *c);
@@ -393,12 +431,14 @@ int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf,
 				uint32_t ofs, uint32_t len);
 struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino);
 int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size);
 
 /* build.c */
 int jffs2_do_mount_fs(struct jffs2_sb_info *c);
 
 /* erase.c */
 void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
+void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 /* wbuf.c */
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 49127a1f0458..8bedfd2ff689 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -23,13 +23,12 @@
  *	jffs2_reserve_space - request physical space to write nodes to flash
  *	@c: superblock info
  *	@minsize: Minimum acceptable size of allocation
- *	@ofs: Returned value of node offset
  *	@len: Returned value of allocation length
  *	@prio: Allocation type - ALLOC_{NORMAL,DELETION}
  *
  *	Requests a block of physical space on the flash. Returns zero for success
- *	and puts 'ofs' and 'len' into the appriopriate place, or returns -ENOSPC
- *	or other error if appropriate.
+ *	and puts 'len' into the appropriate place, or returns -ENOSPC or other
+ *	error if appropriate. No longer returns the offset; see write_ofs(c).
  *
  *	If it returns zero, jffs2_reserve_space() also downs the per-filesystem
  *	allocation semaphore, to prevent more than one allocation from being
@@ -40,9 +39,9 @@
  */
 
 static int jffs2_do_reserve_space(struct jffs2_sb_info *c,  uint32_t minsize,
-					uint32_t *ofs, uint32_t *len, uint32_t sumsize);
+				  uint32_t *len, uint32_t sumsize);
 
-int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 			uint32_t *len, int prio, uint32_t sumsize)
 {
 	int ret = -EAGAIN;
@@ -132,19 +131,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs
 			spin_lock(&c->erase_completion_lock);
 		}
 
-		ret = jffs2_do_reserve_space(c, minsize, ofs, len, sumsize);
+		ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
 		if (ret) {
 			D1(printk(KERN_DEBUG "jffs2_reserve_space: ret is %d\n", ret));
 		}
 	}
 	spin_unlock(&c->erase_completion_lock);
+	if (!ret)
+		ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
 	if (ret)
 		up(&c->alloc_sem);
 	return ret;
 }
 
-int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
-			uint32_t *len, uint32_t sumsize)
+int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
+			   uint32_t *len, uint32_t sumsize)
 {
 	int ret = -EAGAIN;
 	minsize = PAD(minsize);
@@ -153,12 +154,15 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *
 
 	spin_lock(&c->erase_completion_lock);
 	while(ret == -EAGAIN) {
-		ret = jffs2_do_reserve_space(c, minsize, ofs, len, sumsize);
+		ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
 		if (ret) {
 		        D1(printk(KERN_DEBUG "jffs2_reserve_space_gc: looping, ret is %d\n", ret));
 		}
 	}
 	spin_unlock(&c->erase_completion_lock);
+	if (!ret)
+		ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+
 	return ret;
 }
 
@@ -259,10 +263,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 }
 
 /* Called with alloc sem _and_ erase_completion_lock */
-static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs, uint32_t *len, uint32_t sumsize)
+static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
+				  uint32_t *len, uint32_t sumsize)
 {
 	struct jffs2_eraseblock *jeb = c->nextblock;
-	uint32_t reserved_size; 			/* for summary information at the end of the jeb */
+	uint32_t reserved_size;				/* for summary information at the end of the jeb */
 	int ret;
 
  restart:
@@ -312,6 +317,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
 		}
 	} else {
 		if (jeb && minsize > jeb->free_size) {
+			uint32_t waste;
+
 			/* Skip the end of this block and file it as having some dirty space */
 			/* If there's a pending write to it, flush now */
 
@@ -324,10 +331,26 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
 				goto restart;
 			}
 
-			c->wasted_size += jeb->free_size;
-			c->free_size -= jeb->free_size;
-			jeb->wasted_size += jeb->free_size;
-			jeb->free_size = 0;
+			spin_unlock(&c->erase_completion_lock);
+
+			ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+			if (ret)
+				return ret;
+			/* Just lock it again and continue. Nothing much can change because
+			   we hold c->alloc_sem anyway. In fact, it's not entirely clear why
+			   we hold c->erase_completion_lock in the majority of this function...
+			   but that's a question for another (more caffeine-rich) day. */
+			spin_lock(&c->erase_completion_lock);
+
+			waste = jeb->free_size;
+			jffs2_link_node_ref(c, jeb,
+					    (jeb->offset + c->sector_size - waste) | REF_OBSOLETE,
+					    waste, NULL);
+			/* FIXME: that made it count as dirty. Convert to wasted */
+			jeb->dirty_size -= waste;
+			c->dirty_size -= waste;
+			jeb->wasted_size += waste;
+			c->wasted_size += waste;
 
 			jffs2_close_nextblock(c, jeb);
 			jeb = NULL;
@@ -349,7 +372,6 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
 	}
 	/* OK, jeb (==c->nextblock) is now pointing at a block which definitely has
 	   enough space */
-	*ofs = jeb->offset + (c->sector_size - jeb->free_size);
 	*len = jeb->free_size - reserved_size;
 
 	if (c->cleanmarker_size && jeb->used_size == c->cleanmarker_size &&
@@ -365,7 +387,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
 		spin_lock(&c->erase_completion_lock);
 	}
 
-	D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n", *len, *ofs));
+	D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n",
+		  *len, jeb->offset + (c->sector_size - jeb->free_size)));
 	return 0;
 }
 
@@ -374,7 +397,6 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
  *	@c: superblock info
  *	@new: new node reference to add
  *	@len: length of this physical node
- *	@dirty: dirty flag for new node
  *
  *	Should only be used to report nodes for which space has been allocated
  *	by jffs2_reserve_space.
@@ -382,42 +404,30 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
  *	Must be called with the alloc_sem held.
  */
 
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new)
+struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
+						       uint32_t ofs, uint32_t len,
+						       struct jffs2_inode_cache *ic)
 {
 	struct jffs2_eraseblock *jeb;
-	uint32_t len;
+	struct jffs2_raw_node_ref *new;
 
-	jeb = &c->blocks[new->flash_offset / c->sector_size];
-	len = ref_totlen(c, jeb, new);
+	jeb = &c->blocks[ofs / c->sector_size];
 
-	D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", ref_offset(new), ref_flags(new), len));
+	D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n",
+		  ofs & ~3, ofs & 3, len));
 #if 1
-	/* we could get some obsolete nodes after nextblock was refiled
-	   in wbuf.c */
-	if ((c->nextblock || !ref_obsolete(new))
-	    &&(jeb != c->nextblock || ref_offset(new) != jeb->offset + (c->sector_size - jeb->free_size))) {
+	/* Allow non-obsolete nodes only to be added at the end of c->nextblock, 
+	   if c->nextblock is set. Note that wbuf.c will file obsolete nodes
+	   even after refiling c->nextblock */
+	if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE))
+	    && (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) {
 		printk(KERN_WARNING "argh. node added in wrong place\n");
-		jffs2_free_raw_node_ref(new);
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 	}
 #endif
 	spin_lock(&c->erase_completion_lock);
 
-	if (!jeb->first_node)
-		jeb->first_node = new;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = new;
-	jeb->last_node = new;
-
-	jeb->free_size -= len;
-	c->free_size -= len;
-	if (ref_obsolete(new)) {
-		jeb->dirty_size += len;
-		c->dirty_size += len;
-	} else {
-		jeb->used_size += len;
-		c->used_size += len;
-	}
+	new = jffs2_link_node_ref(c, jeb, ofs, len, ic);
 
 	if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) {
 		/* If it lives on the dirty_list, jffs2_reserve_space will put it there */
@@ -438,7 +448,7 @@ int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 
 	spin_unlock(&c->erase_completion_lock);
 
-	return 0;
+	return new;
 }
 
 
@@ -470,8 +480,9 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 	struct jffs2_unknown_node n;
 	int ret, addedsize;
 	size_t retlen;
+	uint32_t freed_len;
 
-	if(!ref) {
+	if(unlikely(!ref)) {
 		printk(KERN_NOTICE "EEEEEK. jffs2_mark_node_obsolete called with NULL node\n");
 		return;
 	}
@@ -499,32 +510,34 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	spin_lock(&c->erase_completion_lock);
 
+	freed_len = ref_totlen(c, jeb, ref);
+
 	if (ref_flags(ref) == REF_UNCHECKED) {
-		D1(if (unlikely(jeb->unchecked_size < ref_totlen(c, jeb, ref))) {
+		D1(if (unlikely(jeb->unchecked_size < freed_len)) {
 			printk(KERN_NOTICE "raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n",
-			       ref_totlen(c, jeb, ref), blocknr, ref->flash_offset, jeb->used_size);
+			       freed_len, blocknr, ref->flash_offset, jeb->used_size);
 			BUG();
 		})
-		D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), ref_totlen(c, jeb, ref)));
-		jeb->unchecked_size -= ref_totlen(c, jeb, ref);
-		c->unchecked_size -= ref_totlen(c, jeb, ref);
+		D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), freed_len));
+		jeb->unchecked_size -= freed_len;
+		c->unchecked_size -= freed_len;
 	} else {
-		D1(if (unlikely(jeb->used_size < ref_totlen(c, jeb, ref))) {
+		D1(if (unlikely(jeb->used_size < freed_len)) {
 			printk(KERN_NOTICE "raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n",
-			       ref_totlen(c, jeb, ref), blocknr, ref->flash_offset, jeb->used_size);
+			       freed_len, blocknr, ref->flash_offset, jeb->used_size);
 			BUG();
 		})
-		D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), ref_totlen(c, jeb, ref)));
-		jeb->used_size -= ref_totlen(c, jeb, ref);
-		c->used_size -= ref_totlen(c, jeb, ref);
+		D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), freed_len));
+		jeb->used_size -= freed_len;
+		c->used_size -= freed_len;
 	}
 
 	// Take care, that wasted size is taken into concern
-	if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + ref_totlen(c, jeb, ref))) && jeb != c->nextblock) {
-		D1(printk(KERN_DEBUG "Dirtying\n"));
-		addedsize = ref_totlen(c, jeb, ref);
-		jeb->dirty_size += ref_totlen(c, jeb, ref);
-		c->dirty_size += ref_totlen(c, jeb, ref);
+	if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) {
+		D1(printk("Dirtying\n"));
+		addedsize = freed_len;
+		jeb->dirty_size += freed_len;
+		c->dirty_size += freed_len;
 
 		/* Convert wasted space to dirty, if not a bad block */
 		if (jeb->wasted_size) {
@@ -543,10 +556,10 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 			}
 		}
 	} else {
-		D1(printk(KERN_DEBUG "Wasting\n"));
+		D1(printk("Wasting\n"));
 		addedsize = 0;
-		jeb->wasted_size += ref_totlen(c, jeb, ref);
-		c->wasted_size += ref_totlen(c, jeb, ref);
+		jeb->wasted_size += freed_len;
+		c->wasted_size += freed_len;
 	}
 	ref->flash_offset = ref_offset(ref) | REF_OBSOLETE;
 
@@ -622,7 +635,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 	/* The erase_free_sem is locked, and has been since before we marked the node obsolete
 	   and potentially put its eraseblock onto the erase_pending_list. Thus, we know that
 	   the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet
-	   by jffs2_free_all_node_refs() in erase.c. Which is nice. */
+	   by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */
 
 	D1(printk(KERN_DEBUG "obliterating obsoleted node at 0x%08x\n", ref_offset(ref)));
 	ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n);
@@ -634,8 +647,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		printk(KERN_WARNING "Short read from obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen);
 		goto out_erase_sem;
 	}
-	if (PAD(je32_to_cpu(n.totlen)) != PAD(ref_totlen(c, jeb, ref))) {
-		printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), ref_totlen(c, jeb, ref));
+	if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) {
+		printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), freed_len);
 		goto out_erase_sem;
 	}
 	if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) {
@@ -671,6 +684,10 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		spin_lock(&c->erase_completion_lock);
 
 		ic = jffs2_raw_ref_to_ic(ref);
+		/* It seems we should never call jffs2_mark_node_obsolete() for
+		   XATTR nodes.... yet. Make sure we notice if/when we change
+		   that :) */
+		BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE);
 		for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino))
 			;
 
@@ -683,51 +700,6 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		spin_unlock(&c->erase_completion_lock);
 	}
 
-
-	/* Merge with the next node in the physical list, if there is one
-	   and if it's also obsolete and if it doesn't belong to any inode */
-	if (ref->next_phys && ref_obsolete(ref->next_phys) &&
-	    !ref->next_phys->next_in_ino) {
-		struct jffs2_raw_node_ref *n = ref->next_phys;
-
-		spin_lock(&c->erase_completion_lock);
-
-		ref->__totlen += n->__totlen;
-		ref->next_phys = n->next_phys;
-                if (jeb->last_node == n) jeb->last_node = ref;
-		if (jeb->gc_node == n) {
-			/* gc will be happy continuing gc on this node */
-			jeb->gc_node=ref;
-		}
-		spin_unlock(&c->erase_completion_lock);
-
-		jffs2_free_raw_node_ref(n);
-	}
-
-	/* Also merge with the previous node in the list, if there is one
-	   and that one is obsolete */
-	if (ref != jeb->first_node ) {
-		struct jffs2_raw_node_ref *p = jeb->first_node;
-
-		spin_lock(&c->erase_completion_lock);
-
-		while (p->next_phys != ref)
-			p = p->next_phys;
-
-		if (ref_obsolete(p) && !ref->next_in_ino) {
-			p->__totlen += ref->__totlen;
-			if (jeb->last_node == ref) {
-				jeb->last_node = p;
-			}
-			if (jeb->gc_node == ref) {
-				/* gc will be happy continuing gc on this node */
-				jeb->gc_node=p;
-			}
-			p->next_phys = ref->next_phys;
-			jffs2_free_raw_node_ref(ref);
-		}
-		spin_unlock(&c->erase_completion_lock);
-	}
  out_erase_sem:
 	up(&c->erase_free_sem);
 }
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d307cf548625..6b5223565405 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -31,9 +31,7 @@ struct kvec;
 #define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode)
 #define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid)
 #define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid)
-
-#define JFFS2_F_I_RDEV_MIN(f) (iminor(OFNI_EDONI_2SFFJ(f)))
-#define JFFS2_F_I_RDEV_MAJ(f) (imajor(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev)
 
 #define ITIME(sec) ((struct timespec){sec, 0})
 #define I_SEC(tv) ((tv).tv_sec)
@@ -60,6 +58,10 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 	f->target = NULL;
 	f->flags = 0;
 	f->usercompr = 0;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+#endif
 }
 
 
@@ -90,13 +92,10 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #define jffs2_flash_writev(a,b,c,d,e,f) jffs2_flash_direct_writev(a,b,c,d,e)
 #define jffs2_wbuf_timeout NULL
 #define jffs2_wbuf_process NULL
-#define jffs2_nor_ecc(c) (0)
 #define jffs2_dataflash(c) (0)
-#define jffs2_nor_wbuf_flash(c) (0)
-#define jffs2_nor_ecc_flash_setup(c) (0)
-#define jffs2_nor_ecc_flash_cleanup(c) do {} while (0)
 #define jffs2_dataflash_setup(c) (0)
 #define jffs2_dataflash_cleanup(c) do {} while (0)
+#define jffs2_nor_wbuf_flash(c) (0)
 #define jffs2_nor_wbuf_flash_setup(c) (0)
 #define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
 
@@ -107,9 +106,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #ifdef CONFIG_JFFS2_SUMMARY
 #define jffs2_can_mark_obsolete(c) (0)
 #else
-#define jffs2_can_mark_obsolete(c) \
-  ((c->mtd->type == MTD_NORFLASH && !(c->mtd->flags & (MTD_ECC|MTD_PROGRAM_REGIONS))) || \
-   c->mtd->type == MTD_RAM)
+#define jffs2_can_mark_obsolete(c) (c->mtd->flags & (MTD_BIT_WRITEABLE))
 #endif
 
 #define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH)
@@ -133,15 +130,11 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c);
 int jffs2_nand_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
 
-#define jffs2_nor_ecc(c) (c->mtd->type == MTD_NORFLASH && (c->mtd->flags & MTD_ECC))
-int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c);
-void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c);
-
 #define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
 int jffs2_dataflash_setup(struct jffs2_sb_info *c);
 void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
 
-#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && (c->mtd->flags & MTD_PROGRAM_REGIONS))
+#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
 
@@ -182,7 +175,7 @@ void jffs2_clear_inode (struct inode *);
 void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
-int jffs2_statfs (struct super_block *, struct kstatfs *);
+int jffs2_statfs (struct dentry *, struct kstatfs *);
 void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index f1695642d0f7..5fec012b02ed 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -66,7 +66,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
 			jffs2_free_full_dnode(tn->fn);
 			jffs2_free_tmp_dnode_info(tn);
 
-			this = this->rb_parent;
+			this = rb_parent(this);
 			if (!this)
 				break;
 
@@ -116,19 +116,42 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 				uint32_t *latest_mctime, uint32_t *mctime_ver)
 {
 	struct jffs2_full_dirent *fd;
+	uint32_t crc;
 
-	/* The direntry nodes are checked during the flash scanning */
-	BUG_ON(ref_flags(ref) == REF_UNCHECKED);
 	/* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
 	BUG_ON(ref_obsolete(ref));
 
-	/* Sanity check */
-	if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
-		JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
-		       ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
+	crc = crc32(0, rd, sizeof(*rd) - 8);
+	if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
+		JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n",
+			     ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
 		return 1;
 	}
 
+	/* If we've never checked the CRCs on this node, check them now */
+	if (ref_flags(ref) == REF_UNCHECKED) {
+		struct jffs2_eraseblock *jeb;
+		int len;
+
+		/* Sanity check */
+		if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
+			JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
+				    ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
+			return 1;
+		}
+
+		jeb = &c->blocks[ref->flash_offset / c->sector_size];
+		len = ref_totlen(c, jeb, ref);
+
+		spin_lock(&c->erase_completion_lock);
+		jeb->used_size += len;
+		jeb->unchecked_size -= len;
+		c->used_size += len;
+		c->unchecked_size -= len;
+		ref->flash_offset = ref_offset(ref) | REF_PRISTINE;
+		spin_unlock(&c->erase_completion_lock);
+	}
+
 	fd = jffs2_alloc_full_dirent(rd->nsize + 1);
 	if (unlikely(!fd))
 		return -ENOMEM;
@@ -198,13 +221,21 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 	struct jffs2_tmp_dnode_info *tn;
 	uint32_t len, csize;
 	int ret = 1;
+	uint32_t crc;
 
 	/* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
 	BUG_ON(ref_obsolete(ref));
 
+	crc = crc32(0, rd, sizeof(*rd) - 8);
+	if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
+		JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n",
+			     ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
+		return 1;
+	}
+
 	tn = jffs2_alloc_tmp_dnode_info();
 	if (!tn) {
-		JFFS2_ERROR("failed to allocate tn (%d bytes).\n", sizeof(*tn));
+		JFFS2_ERROR("failed to allocate tn (%zu bytes).\n", sizeof(*tn));
 		return -ENOMEM;
 	}
 
@@ -213,14 +244,6 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	/* If we've never checked the CRCs on this node, check them now */
 	if (ref_flags(ref) == REF_UNCHECKED) {
-		uint32_t crc;
-
-		crc = crc32(0, rd, sizeof(*rd) - 8);
-		if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
-			JFFS2_NOTICE("header CRC failed on node at %#08x: read %#08x, calculated %#08x\n",
-					ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
-			goto free_out;
-		}
 
 		/* Sanity checks */
 		if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) ||
@@ -343,7 +366,7 @@ free_out:
  * Helper function for jffs2_get_inode_nodes().
  * It is called every time an unknown node is found.
  *
- * Returns: 0 on succes;
+ * Returns: 0 on success;
  * 	    1 if the node should be marked obsolete;
  * 	    negative error code on failure.
  */
@@ -354,37 +377,30 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
 
 	un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype));
 
-	if (crc32(0, un, sizeof(struct jffs2_unknown_node) - 4) != je32_to_cpu(un->hdr_crc)) {
-		/* Hmmm. This should have been caught at scan time. */
-		JFFS2_NOTICE("node header CRC failed at %#08x. But it must have been OK earlier.\n", ref_offset(ref));
-		jffs2_dbg_dump_node(c, ref_offset(ref));
-		return 1;
-	} else {
-		switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) {
+	switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) {
 
-		case JFFS2_FEATURE_INCOMPAT:
-			JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n",
-				je16_to_cpu(un->nodetype), ref_offset(ref));
-			/* EEP */
-			BUG();
-			break;
+	case JFFS2_FEATURE_INCOMPAT:
+		JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n",
+			    je16_to_cpu(un->nodetype), ref_offset(ref));
+		/* EEP */
+		BUG();
+		break;
 
-		case JFFS2_FEATURE_ROCOMPAT:
-			JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n",
-					je16_to_cpu(un->nodetype), ref_offset(ref));
-			BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO));
-			break;
+	case JFFS2_FEATURE_ROCOMPAT:
+		JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n",
+			    je16_to_cpu(un->nodetype), ref_offset(ref));
+		BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO));
+		break;
 
-		case JFFS2_FEATURE_RWCOMPAT_COPY:
-			JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n",
-					je16_to_cpu(un->nodetype), ref_offset(ref));
-			break;
+	case JFFS2_FEATURE_RWCOMPAT_COPY:
+		JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n",
+			     je16_to_cpu(un->nodetype), ref_offset(ref));
+		break;
 
-		case JFFS2_FEATURE_RWCOMPAT_DELETE:
-			JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
-					je16_to_cpu(un->nodetype), ref_offset(ref));
-			return 1;
-		}
+	case JFFS2_FEATURE_RWCOMPAT_DELETE:
+		JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
+			     je16_to_cpu(un->nodetype), ref_offset(ref));
+		return 1;
 	}
 
 	return 0;
@@ -434,7 +450,7 @@ static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
 	}
 
 	if (retlen < len) {
-		JFFS2_ERROR("short read at %#08x: %d instead of %d.\n",
+		JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n",
 				offs, retlen, len);
 		return -EIO;
 	}
@@ -542,13 +558,25 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
 		}
 
 		if (retlen < len) {
-			JFFS2_ERROR("short read at %#08x: %d instead of %d.\n", ref_offset(ref), retlen, len);
+			JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", ref_offset(ref), retlen, len);
 			err = -EIO;
 			goto free_out;
 		}
 
 		node = (union jffs2_node_union *)bufstart;
 
+		/* No need to mask in the valid bit; it shouldn't be invalid */
+		if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) {
+			JFFS2_NOTICE("Node header CRC failed at %#08x. {%04x,%04x,%08x,%08x}\n",
+				     ref_offset(ref), je16_to_cpu(node->u.magic),
+				     je16_to_cpu(node->u.nodetype),
+				     je32_to_cpu(node->u.totlen),
+				     je32_to_cpu(node->u.hdr_crc));
+			jffs2_dbg_dump_node(c, ref_offset(ref));
+			jffs2_mark_node_obsolete(c, ref);
+			goto cont;
+		}
+
 		switch (je16_to_cpu(node->u.nodetype)) {
 
 		case JFFS2_NODETYPE_DIRENT:
@@ -606,6 +634,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
 				goto free_out;
 
 		}
+	cont:
 		spin_lock(&c->erase_completion_lock);
 	}
 
@@ -679,12 +708,12 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 			jffs2_mark_node_obsolete(c, fn->raw);
 
 		BUG_ON(rb->rb_left);
-		if (rb->rb_parent && rb->rb_parent->rb_left == rb) {
+		if (rb_parent(rb) && rb_parent(rb)->rb_left == rb) {
 			/* We were then left-hand child of our parent. We need
 			 * to move our own right-hand child into our place. */
 			repl_rb = rb->rb_right;
 			if (repl_rb)
-				repl_rb->rb_parent = rb->rb_parent;
+				rb_set_parent(repl_rb, rb_parent(rb));
 		} else
 			repl_rb = NULL;
 
@@ -692,14 +721,14 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 
 		/* Remove the spent tn from the tree; don't bother rebalancing
 		 * but put our right-hand child in our own place. */
-		if (tn->rb.rb_parent) {
-			if (tn->rb.rb_parent->rb_left == &tn->rb)
-				tn->rb.rb_parent->rb_left = repl_rb;
-			else if (tn->rb.rb_parent->rb_right == &tn->rb)
-				tn->rb.rb_parent->rb_right = repl_rb;
+		if (rb_parent(&tn->rb)) {
+			if (rb_parent(&tn->rb)->rb_left == &tn->rb)
+				rb_parent(&tn->rb)->rb_left = repl_rb;
+			else if (rb_parent(&tn->rb)->rb_right == &tn->rb)
+				rb_parent(&tn->rb)->rb_right = repl_rb;
 			else BUG();
 		} else if (tn->rb.rb_right)
-			tn->rb.rb_right->rb_parent = NULL;
+			rb_set_parent(tn->rb.rb_right, NULL);
 
 		jffs2_free_tmp_dnode_info(tn);
 		if (ret) {
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index cf55b221fc2b..61618080b86f 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -65,6 +65,28 @@ static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) {
 		return DEFAULT_EMPTY_SCAN_SIZE;
 }
 
+static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	int ret;
+
+	if ((ret = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
+		return ret;
+	if ((ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size)))
+		return ret;
+	/* Turn wasted size into dirty, since we apparently
+	   think it's recoverable now. */
+	jeb->dirty_size += jeb->wasted_size;
+	c->dirty_size += jeb->wasted_size;
+	c->wasted_size -= jeb->wasted_size;
+	jeb->wasted_size = 0;
+	if (VERYDIRTY(c, jeb->dirty_size)) {
+		list_add(&jeb->list, &c->very_dirty_list);
+	} else {
+		list_add(&jeb->list, &c->dirty_list);
+	}
+	return 0;
+}
+
 int jffs2_scan_medium(struct jffs2_sb_info *c)
 {
 	int i, ret;
@@ -170,34 +192,20 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 					(!c->nextblock || c->nextblock->free_size < jeb->free_size)) {
 				/* Better candidate for the next writes to go to */
 				if (c->nextblock) {
-					c->nextblock->dirty_size += c->nextblock->free_size + c->nextblock->wasted_size;
-					c->dirty_size += c->nextblock->free_size + c->nextblock->wasted_size;
-					c->free_size -= c->nextblock->free_size;
-					c->wasted_size -= c->nextblock->wasted_size;
-					c->nextblock->free_size = c->nextblock->wasted_size = 0;
-					if (VERYDIRTY(c, c->nextblock->dirty_size)) {
-						list_add(&c->nextblock->list, &c->very_dirty_list);
-					} else {
-						list_add(&c->nextblock->list, &c->dirty_list);
-					}
+					ret = file_dirty(c, c->nextblock);
+					if (ret)
+						return ret;
 					/* deleting summary information of the old nextblock */
 					jffs2_sum_reset_collected(c->summary);
 				}
-				/* update collected summary infromation for the current nextblock */
+				/* update collected summary information for the current nextblock */
 				jffs2_sum_move_collected(c, s);
 				D1(printk(KERN_DEBUG "jffs2_scan_medium(): new nextblock = 0x%08x\n", jeb->offset));
 				c->nextblock = jeb;
 			} else {
-				jeb->dirty_size += jeb->free_size + jeb->wasted_size;
-				c->dirty_size += jeb->free_size + jeb->wasted_size;
-				c->free_size -= jeb->free_size;
-				c->wasted_size -= jeb->wasted_size;
-				jeb->free_size = jeb->wasted_size = 0;
-				if (VERYDIRTY(c, jeb->dirty_size)) {
-					list_add(&jeb->list, &c->very_dirty_list);
-				} else {
-					list_add(&jeb->list, &c->dirty_list);
-				}
+				ret = file_dirty(c, jeb);
+				if (ret)
+					return ret;
 			}
 			break;
 
@@ -222,9 +230,6 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		}
 	}
 
-	if (jffs2_sum_active() && s)
-		kfree(s);
-
 	/* Nextblock dirty is always seen as wasted, because we cannot recycle it now */
 	if (c->nextblock && (c->nextblock->dirty_size)) {
 		c->nextblock->wasted_size += c->nextblock->dirty_size;
@@ -242,11 +247,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 
 		D1(printk(KERN_DEBUG "jffs2_scan_medium(): Skipping %d bytes in nextblock to ensure page alignment\n",
 			  skip));
-		c->nextblock->wasted_size += skip;
-		c->wasted_size += skip;
-
-		c->nextblock->free_size -= skip;
-		c->free_size -= skip;
+		jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+		jffs2_scan_dirty_space(c, c->nextblock, skip);
 	}
 #endif
 	if (c->nr_erasing_blocks) {
@@ -266,6 +268,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 	else
 		c->mtd->unpoint(c->mtd, flashbuf, 0, c->mtd->size);
 #endif
+	if (s)
+		kfree(s);
+
 	return ret;
 }
 
@@ -290,7 +295,7 @@ int jffs2_fill_scan_buf (struct jffs2_sb_info *c, void *buf,
 int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
 	if ((jeb->used_size + jeb->unchecked_size) == PAD(c->cleanmarker_size) && !jeb->dirty_size
-		&& (!jeb->first_node || !jeb->first_node->next_phys) )
+	    && (!jeb->first_node || !ref_next(jeb->first_node)) )
 		return BLK_STATE_CLEANMARKER;
 
 	/* move blocks with max 4 byte dirty space to cleanlist */
@@ -306,11 +311,119 @@ int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
 		return BLK_STATE_ALLDIRTY;
 }
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				 struct jffs2_raw_xattr *rx, uint32_t ofs,
+				 struct jffs2_summary *s)
+{
+	struct jffs2_xattr_datum *xd;
+	uint32_t totlen, crc;
+	int err;
+
+	crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4);
+	if (crc != je32_to_cpu(rx->node_crc)) {
+		if (je32_to_cpu(rx->node_crc) != 0xffffffff)
+			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				      ofs, je32_to_cpu(rx->node_crc), crc);
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
+			return err;
+		return 0;
+	}
+
+	totlen = PAD(sizeof(*rx) + rx->name_len + 1 + je16_to_cpu(rx->value_len));
+	if (totlen != je32_to_cpu(rx->totlen)) {
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
+			      ofs, je32_to_cpu(rx->totlen), totlen);
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
+			return err;
+		return 0;
+	}
+
+	xd = jffs2_setup_xattr_datum(c, je32_to_cpu(rx->xid), je32_to_cpu(rx->version));
+	if (IS_ERR(xd)) {
+		if (PTR_ERR(xd) == -EEXIST) {
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rx->totlen)))))
+				return err;
+			return 0;
+		}
+		return PTR_ERR(xd);
+	}
+	xd->xprefix = rx->xprefix;
+	xd->name_len = rx->name_len;
+	xd->value_len = je16_to_cpu(rx->value_len);
+	xd->data_crc = je32_to_cpu(rx->data_crc);
+
+	xd->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL);
+	/* FIXME */ xd->node->next_in_ino = (void *)xd;
+
+	if (jffs2_sum_active())
+		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
+	dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n",
+		  ofs, xd->xid, xd->version);
+	return 0;
+}
+
+static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				struct jffs2_raw_xref *rr, uint32_t ofs,
+				struct jffs2_summary *s)
+{
+	struct jffs2_xattr_ref *ref;
+	uint32_t crc;
+	int err;
+
+	crc = crc32(0, rr, sizeof(*rr) - 4);
+	if (crc != je32_to_cpu(rr->node_crc)) {
+		if (je32_to_cpu(rr->node_crc) != 0xffffffff)
+			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				      ofs, je32_to_cpu(rr->node_crc), crc);
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen)))))
+			return err;
+		return 0;
+	}
+
+	if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) {
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%zd\n",
+			      ofs, je32_to_cpu(rr->totlen),
+			      PAD(sizeof(struct jffs2_raw_xref)));
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rr->totlen))))
+			return err;
+		return 0;
+	}
+
+	ref = jffs2_alloc_xattr_ref();
+	if (!ref)
+		return -ENOMEM;
+
+	/* BEFORE jffs2_build_xattr_subsystem() is called,
+	 * ref->xid is used to store the 32bit xid; xd is not used.
+	 * ref->ino is used to store the 32bit inode number; ic is not used.
+	 * Those variables are declared as unions, so the members of each
+	 * are mutually exclusive. In a similar way, ref->next is temporarily
+	 * used to chain all xattr_ref objects. It's re-chained to
+	 * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly.
+	 */
+	ref->ino = je32_to_cpu(rr->ino);
+	ref->xid = je32_to_cpu(rr->xid);
+	ref->next = c->xref_temp;
+	c->xref_temp = ref;
+
+	ref->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), NULL);
+	/* FIXME */ ref->node->next_in_ino = (void *)ref;
+
+	if (jffs2_sum_active())
+		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
+	dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n",
+		  ofs, ref->xid, ref->ino);
+	return 0;
+}
+#endif
+
+/* Called with 'buf_size == 0' if buf is in fact a pointer _directly_ into
+   the flash, XIP-style */
 static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-				unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
+				  unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
 	struct jffs2_unknown_node *node;
 	struct jffs2_unknown_node crcnode;
-	struct jffs2_sum_marker *sm;
 	uint32_t ofs, prevofs;
 	uint32_t hdr_crc, buf_ofs, buf_len;
 	int err;
@@ -344,44 +457,75 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 #endif
 
 	if (jffs2_sum_active()) {
-		sm = kmalloc(sizeof(struct jffs2_sum_marker), GFP_KERNEL);
-		if (!sm) {
-			return -ENOMEM;
-		}
-
-		err = jffs2_fill_scan_buf(c, (unsigned char *) sm, jeb->offset + c->sector_size -
-					sizeof(struct jffs2_sum_marker), sizeof(struct jffs2_sum_marker));
-		if (err) {
-			kfree(sm);
-			return err;
-		}
-
-		if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC ) {
-			err = jffs2_sum_scan_sumnode(c, jeb, je32_to_cpu(sm->offset), &pseudo_random);
-			if (err) {
-				kfree(sm);
+		struct jffs2_sum_marker *sm;
+		void *sumptr = NULL;
+		uint32_t sumlen;
+	      
+		if (!buf_size) {
+			/* XIP case. Just look, point at the summary if it's there */
+			sm = (void *)buf + c->sector_size - sizeof(*sm);
+			if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
+				sumptr = buf + je32_to_cpu(sm->offset);
+				sumlen = c->sector_size - je32_to_cpu(sm->offset);
+			}
+		} else {
+			/* If NAND flash, read a whole page of it. Else just the end */
+			if (c->wbuf_pagesize)
+				buf_len = c->wbuf_pagesize;
+			else
+				buf_len = sizeof(*sm);
+
+			/* Read as much as we want into the _end_ of the preallocated buffer */
+			err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len, 
+						  jeb->offset + c->sector_size - buf_len,
+						  buf_len);				
+			if (err)
 				return err;
+
+			sm = (void *)buf + buf_size - sizeof(*sm);
+			if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
+				sumlen = c->sector_size - je32_to_cpu(sm->offset);
+				sumptr = buf + buf_size - sumlen;
+
+				/* Now, make sure the summary itself is available */
+				if (sumlen > buf_size) {
+					/* Need to kmalloc for this. */
+					sumptr = kmalloc(sumlen, GFP_KERNEL);
+					if (!sumptr)
+						return -ENOMEM;
+					memcpy(sumptr + sumlen - buf_len, buf + buf_size - buf_len, buf_len);
+				}
+				if (buf_len < sumlen) {
+					/* Need to read more so that the entire summary node is present */
+					err = jffs2_fill_scan_buf(c, sumptr, 
+								  jeb->offset + c->sector_size - sumlen,
+								  sumlen - buf_len);				
+					if (err)
+						return err;
+				}
 			}
+
 		}
 
-		kfree(sm);
+		if (sumptr) {
+			err = jffs2_sum_scan_sumnode(c, jeb, sumptr, sumlen, &pseudo_random);
 
-		ofs = jeb->offset;
-		prevofs = jeb->offset - 1;
+			if (buf_size && sumlen > buf_size)
+				kfree(sumptr);
+			/* If it returns with a real error, bail. 
+			   If it returns positive, that's a block classification
+			   (i.e. BLK_STATE_xxx) so return that too.
+			   If it returns zero, fall through to full scan. */
+			if (err)
+				return err;
+		}
 	}
 
 	buf_ofs = jeb->offset;
 
 	if (!buf_size) {
+		/* This is the XIP case -- we're reading _directly_ from the flash chip */
 		buf_len = c->sector_size;
-
-		if (jffs2_sum_active()) {
-			/* must reread because of summary test */
-			err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len);
-			if (err)
-				return err;
-		}
-
 	} else {
 		buf_len = EMPTY_SCAN_SIZE(c->sector_size);
 		err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len);
@@ -418,7 +562,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 	if (ofs) {
 		D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset,
 			  jeb->offset + ofs));
-		DIRTY_SPACE(ofs);
+		if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
+			return err;
+		if ((err = jffs2_scan_dirty_space(c, jeb, ofs)))
+			return err;
 	}
 
 	/* Now ofs is a complete physical flash offset as it always was... */
@@ -433,6 +580,11 @@ scan_more:
 
 		jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
 
+		/* Make sure there are node refs available for use */
+		err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+		if (err)
+			return err;
+
 		cond_resched();
 
 		if (ofs & 3) {
@@ -442,7 +594,8 @@ scan_more:
 		}
 		if (ofs == prevofs) {
 			printk(KERN_WARNING "ofs 0x%08x has already been seen. Skipping\n", ofs);
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -451,7 +604,8 @@ scan_more:
 		if (jeb->offset + c->sector_size < ofs + sizeof(*node)) {
 			D1(printk(KERN_DEBUG "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", sizeof(struct jffs2_unknown_node),
 				  jeb->offset, c->sector_size, ofs, sizeof(*node)));
-			DIRTY_SPACE((jeb->offset + c->sector_size)-ofs);
+			if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs)))
+				return err;
 			break;
 		}
 
@@ -481,7 +635,8 @@ scan_more:
 				if (*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff) {
 					printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n",
 					       empty_start, ofs);
-					DIRTY_SPACE(ofs-empty_start);
+					if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start)))
+						return err;
 					goto scan_more;
 				}
 
@@ -494,7 +649,7 @@ scan_more:
 			/* If we're only checking the beginning of a block with a cleanmarker,
 			   bail now */
 			if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) &&
-			    c->cleanmarker_size && !jeb->dirty_size && !jeb->first_node->next_phys) {
+			    c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) {
 				D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size)));
 				return BLK_STATE_CLEANMARKER;
 			}
@@ -518,20 +673,23 @@ scan_more:
 
 		if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) {
 			printk(KERN_WARNING "Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", ofs);
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
 		if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) {
 			D1(printk(KERN_DEBUG "Dirty bitmask at 0x%08x\n", ofs));
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
 		if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) {
 			printk(KERN_WARNING "Old JFFS2 bitmask found at 0x%08x\n", ofs);
 			printk(KERN_WARNING "You cannot use older JFFS2 filesystems with newer kernels\n");
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -540,7 +698,8 @@ scan_more:
 			noisy_printk(&noise, "jffs2_scan_eraseblock(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n",
 				     JFFS2_MAGIC_BITMASK, ofs,
 				     je16_to_cpu(node->magic));
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -557,7 +716,8 @@ scan_more:
 				     je32_to_cpu(node->totlen),
 				     je32_to_cpu(node->hdr_crc),
 				     hdr_crc);
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -568,7 +728,8 @@ scan_more:
 			printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
 			       ofs, je32_to_cpu(node->totlen));
 			printk(KERN_WARNING "Perhaps the file system was created with the wrong erase size?\n");
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -576,7 +737,8 @@ scan_more:
 		if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) {
 			/* Wheee. This is an obsoleted node */
 			D2(printk(KERN_DEBUG "Node at 0x%08x is obsolete. Skipping\n", ofs));
-			DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+				return err;
 			ofs += PAD(je32_to_cpu(node->totlen));
 			continue;
 		}
@@ -614,30 +776,59 @@ scan_more:
 			ofs += PAD(je32_to_cpu(node->totlen));
 			break;
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				D1(printk(KERN_DEBUG "Fewer than %d bytes (xattr node)"
+					  " left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len, ofs));
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_xattr_node(c, jeb, (void *)node, ofs, s);
+			if (err)
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+		case JFFS2_NODETYPE_XREF:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				D1(printk(KERN_DEBUG "Fewer than %d bytes (xref node)"
+					  " left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len, ofs));
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_xref_node(c, jeb, (void *)node, ofs, s);
+			if (err)
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+#endif	/* CONFIG_JFFS2_FS_XATTR */
+
 		case JFFS2_NODETYPE_CLEANMARKER:
 			D1(printk(KERN_DEBUG "CLEANMARKER node found at 0x%08x\n", ofs));
 			if (je32_to_cpu(node->totlen) != c->cleanmarker_size) {
 				printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n",
 				       ofs, je32_to_cpu(node->totlen), c->cleanmarker_size);
-				DIRTY_SPACE(PAD(sizeof(struct jffs2_unknown_node)));
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
+					return err;
 				ofs += PAD(sizeof(struct jffs2_unknown_node));
 			} else if (jeb->first_node) {
 				printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", ofs, jeb->offset);
-				DIRTY_SPACE(PAD(sizeof(struct jffs2_unknown_node)));
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
+					return err;
 				ofs += PAD(sizeof(struct jffs2_unknown_node));
 			} else {
-				struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
-				if (!marker_ref) {
-					printk(KERN_NOTICE "Failed to allocate node ref for clean marker\n");
-					return -ENOMEM;
-				}
-				marker_ref->next_in_ino = NULL;
-				marker_ref->next_phys = NULL;
-				marker_ref->flash_offset = ofs | REF_NORMAL;
-				marker_ref->__totlen = c->cleanmarker_size;
-				jeb->first_node = jeb->last_node = marker_ref;
+				jffs2_link_node_ref(c, jeb, ofs | REF_NORMAL, c->cleanmarker_size, NULL);
 
-				USED_SPACE(PAD(c->cleanmarker_size));
 				ofs += PAD(c->cleanmarker_size);
 			}
 			break;
@@ -645,7 +836,8 @@ scan_more:
 		case JFFS2_NODETYPE_PADDING:
 			if (jffs2_sum_active())
 				jffs2_sum_add_padding_mem(s, je32_to_cpu(node->totlen));
-			DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+				return err;
 			ofs += PAD(je32_to_cpu(node->totlen));
 			break;
 
@@ -656,7 +848,8 @@ scan_more:
 			        c->flags |= JFFS2_SB_FLAG_RO;
 				if (!(jffs2_is_readonly(c)))
 					return -EROFS;
-				DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+					return err;
 				ofs += PAD(je32_to_cpu(node->totlen));
 				break;
 
@@ -666,15 +859,21 @@ scan_more:
 
 			case JFFS2_FEATURE_RWCOMPAT_DELETE:
 				D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
-				DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+					return err;
 				ofs += PAD(je32_to_cpu(node->totlen));
 				break;
 
-			case JFFS2_FEATURE_RWCOMPAT_COPY:
+			case JFFS2_FEATURE_RWCOMPAT_COPY: {
 				D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
-				USED_SPACE(PAD(je32_to_cpu(node->totlen)));
+
+				jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL);
+
+				/* We can't summarise nodes we don't grok */
+				jffs2_sum_disable_collecting(s);
 				ofs += PAD(je32_to_cpu(node->totlen));
 				break;
+				}
 			}
 		}
 	}
@@ -687,9 +886,9 @@ scan_more:
 		}
 	}
 
-	D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x\n", jeb->offset,
-		  jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size));
-
+	D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n",
+		  jeb->offset,jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size, jeb->wasted_size));
+	
 	/* mark_node_obsolete can add to wasted !! */
 	if (jeb->wasted_size) {
 		jeb->dirty_size += jeb->wasted_size;
@@ -730,9 +929,9 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin
 static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 				 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s)
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_inode_cache *ic;
 	uint32_t ino = je32_to_cpu(ri->ino);
+	int err;
 
 	D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs));
 
@@ -745,12 +944,6 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	   Which means that the _full_ amount of time to get to proper write mode with GC
 	   operational may actually be _longer_ than before. Sucks to be me. */
 
-	raw = jffs2_alloc_raw_node_ref();
-	if (!raw) {
-		printk(KERN_NOTICE "jffs2_scan_inode_node(): allocation of node reference failed\n");
-		return -ENOMEM;
-	}
-
 	ic = jffs2_get_ino_cache(c, ino);
 	if (!ic) {
 		/* Inocache get failed. Either we read a bogus ino# or it's just genuinely the
@@ -762,30 +955,17 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 			printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
 			       ofs, je32_to_cpu(ri->node_crc), crc);
 			/* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
-			DIRTY_SPACE(PAD(je32_to_cpu(ri->totlen)));
-			jffs2_free_raw_node_ref(raw);
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(ri->totlen)))))
+				return err;
 			return 0;
 		}
 		ic = jffs2_scan_make_ino_cache(c, ino);
-		if (!ic) {
-			jffs2_free_raw_node_ref(raw);
+		if (!ic)
 			return -ENOMEM;
-		}
 	}
 
 	/* Wheee. It worked */
-
-	raw->flash_offset = ofs | REF_UNCHECKED;
-	raw->__totlen = PAD(je32_to_cpu(ri->totlen));
-	raw->next_phys = NULL;
-	raw->next_in_ino = ic->nodes;
-
-	ic->nodes = raw;
-	if (!jeb->first_node)
-		jeb->first_node = raw;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = raw;
-	jeb->last_node = raw;
+	jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic);
 
 	D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n",
 		  je32_to_cpu(ri->ino), je32_to_cpu(ri->version),
@@ -794,8 +974,6 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 
 	pseudo_random += je32_to_cpu(ri->version);
 
-	UNCHECKED_SPACE(PAD(je32_to_cpu(ri->totlen)));
-
 	if (jffs2_sum_active()) {
 		jffs2_sum_add_inode_mem(s, ri, ofs - jeb->offset);
 	}
@@ -806,10 +984,10 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 				  struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s)
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_full_dirent *fd;
 	struct jffs2_inode_cache *ic;
 	uint32_t crc;
+	int err;
 
 	D1(printk(KERN_DEBUG "jffs2_scan_dirent_node(): Node at 0x%08x\n", ofs));
 
@@ -821,7 +999,8 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 		printk(KERN_NOTICE "jffs2_scan_dirent_node(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
 		       ofs, je32_to_cpu(rd->node_crc), crc);
 		/* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
-		DIRTY_SPACE(PAD(je32_to_cpu(rd->totlen)));
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
+			return err;
 		return 0;
 	}
 
@@ -842,40 +1021,23 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 		jffs2_free_full_dirent(fd);
 		/* FIXME: Why do we believe totlen? */
 		/* We believe totlen because the CRC on the node _header_ was OK, just the name failed. */
-		DIRTY_SPACE(PAD(je32_to_cpu(rd->totlen)));
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
+			return err;
 		return 0;
 	}
-	raw = jffs2_alloc_raw_node_ref();
-	if (!raw) {
-		jffs2_free_full_dirent(fd);
-		printk(KERN_NOTICE "jffs2_scan_dirent_node(): allocation of node reference failed\n");
-		return -ENOMEM;
-	}
 	ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(rd->pino));
 	if (!ic) {
 		jffs2_free_full_dirent(fd);
-		jffs2_free_raw_node_ref(raw);
 		return -ENOMEM;
 	}
 
-	raw->__totlen = PAD(je32_to_cpu(rd->totlen));
-	raw->flash_offset = ofs | REF_PRISTINE;
-	raw->next_phys = NULL;
-	raw->next_in_ino = ic->nodes;
-	ic->nodes = raw;
-	if (!jeb->first_node)
-		jeb->first_node = raw;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = raw;
-	jeb->last_node = raw;
+	fd->raw = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rd->totlen)), ic);
 
-	fd->raw = raw;
 	fd->next = NULL;
 	fd->version = je32_to_cpu(rd->version);
 	fd->ino = je32_to_cpu(rd->ino);
 	fd->nhash = full_name_hash(fd->name, rd->nsize);
 	fd->type = rd->type;
-	USED_SPACE(PAD(je32_to_cpu(rd->totlen)));
 	jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
 
 	if (jffs2_sum_active()) {
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
new file mode 100644
index 000000000000..52a9894a6364
--- /dev/null
+++ b/fs/jffs2/security.c
@@ -0,0 +1,82 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include <linux/security.h>
+#include "nodelist.h"
+
+/* ---- Initial Security Label Attachment -------------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir)
+{
+	int rc;
+	size_t len;
+	void *value;
+	char *name;
+
+	rc = security_inode_init_security(inode, dir, &name, &value, &len);
+	if (rc) {
+		if (rc == -EOPNOTSUPP)
+			return 0;
+		return rc;
+	}
+	rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0);
+
+        kfree(name);
+        kfree(value);
+        return rc;
+}
+
+/* ---- XATTR Handler for "security.*" ----------------- */
+static int jffs2_security_getxattr(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size);
+}
+
+static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer,
+				   size_t size, int flags)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags);
+}
+
+static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size,
+				       const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen <= list_size) {
+		strcpy(list, XATTR_SECURITY_PREFIX);
+		strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+struct xattr_handler jffs2_security_xattr_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list = jffs2_security_listxattr,
+	.set = jffs2_security_setxattr,
+	.get = jffs2_security_getxattr
+};
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index fb9cec61fcf2..0b02fc79e4d1 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -5,6 +5,7 @@
  *                     Zoltan Sogor <weth@inf.u-szeged.hu>,
  *                     Patrik Kluba <pajko@halom.u-szeged.hu>,
  *                     University of Szeged, Hungary
+ *               2005  KaiGai Kohei <kaigai@ak.jp.nec.com>
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
@@ -81,6 +82,19 @@ static int jffs2_sum_add_mem(struct jffs2_summary *s, union jffs2_sum_mem *item)
 			dbg_summary("dirent (%u) added to summary\n",
 						je32_to_cpu(item->d.ino));
 			break;
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR:
+			s->sum_size += JFFS2_SUMMARY_XATTR_SIZE;
+			s->sum_num++;
+			dbg_summary("xattr (xid=%u, version=%u) added to summary\n",
+				    je32_to_cpu(item->x.xid), je32_to_cpu(item->x.version));
+			break;
+		case JFFS2_NODETYPE_XREF:
+			s->sum_size += JFFS2_SUMMARY_XREF_SIZE;
+			s->sum_num++;
+			dbg_summary("xref added to summary\n");
+			break;
+#endif
 		default:
 			JFFS2_WARNING("UNKNOWN node type %u\n",
 					    je16_to_cpu(item->u.nodetype));
@@ -141,6 +155,40 @@ int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *r
 	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
 }
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs)
+{
+	struct jffs2_sum_xattr_mem *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (!entry)
+		return -ENOMEM;
+
+	/* Fields are copied from rx as-is; the offset is stored via cpu_to_je32(). */
+	entry->next = NULL;
+	entry->nodetype = rx->nodetype;
+	entry->xid = rx->xid;
+	entry->version = rx->version;
+	entry->totlen = rx->totlen;
+	entry->offset = cpu_to_je32(ofs);
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)entry);
+}
+
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs)
+{
+	struct jffs2_sum_xref_mem *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (!entry)
+		return -ENOMEM;
+
+	/* Only the node type and the offset are kept for an xref entry. */
+	entry->next = NULL;
+	entry->nodetype = rr->nodetype;
+	entry->offset = cpu_to_je32(ofs);
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)entry);
+}
+#endif
 /* Cleanup every collected summary information */
 
 static void jffs2_sum_clean_collected(struct jffs2_summary *s)
@@ -259,7 +307,40 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
 
 			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
 		}
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR: {
+			struct jffs2_sum_xattr_mem *temp;
+			if (je32_to_cpu(node->x.version) == 0xffffffff)
+				return 0;
+			temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+			if (!temp)
+				goto no_mem;
+
+			temp->nodetype = node->x.nodetype;
+			temp->xid = node->x.xid;
+			temp->version = node->x.version;
+			temp->totlen = node->x.totlen;
+			temp->offset = cpu_to_je32(ofs);
+			temp->next = NULL;
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+		case JFFS2_NODETYPE_XREF: {
+			struct jffs2_sum_xref_mem *temp;
+
+			if (je32_to_cpu(node->r.ino) == 0xffffffff
+			    && je32_to_cpu(node->r.xid) == 0xffffffff)
+				return 0;
+			temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+			if (!temp)
+				goto no_mem;
+			temp->nodetype = node->r.nodetype;
+			temp->offset = cpu_to_je32(ofs);
+			temp->next = NULL;
 
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+#endif
 		case JFFS2_NODETYPE_PADDING:
 			dbg_summary("node PADDING\n");
 			c->summary->sum_padded += je32_to_cpu(node->u.totlen);
@@ -288,23 +369,41 @@ no_mem:
 	return -ENOMEM;
 }
 
+static struct jffs2_raw_node_ref *sum_link_node_ref(struct jffs2_sb_info *c,
+						    struct jffs2_eraseblock *jeb,
+						    uint32_t ofs, uint32_t len,
+						    struct jffs2_inode_cache *ic)
+{
+	uint32_t start = ofs & ~3;	/* ofs with its low two bits masked off */
+
+	/* The summary never lists dirty space explicitly, so mark any gap dirty */
+	if (start > c->sector_size - jeb->free_size)
+		jffs2_scan_dirty_space(c, jeb, start - (c->sector_size - jeb->free_size));
+
+	return jffs2_link_node_ref(c, jeb, jeb->offset + ofs, len, ic);
+}
 
 /* Process the stored summary information - helper function for jffs2_sum_scan_sumnode() */
 
 static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 				struct jffs2_raw_summary *summary, uint32_t *pseudo_random)
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_inode_cache *ic;
 	struct jffs2_full_dirent *fd;
 	void *sp;
 	int i, ino;
+	int err;
 
 	sp = summary->sum;
 
 	for (i=0; i<je32_to_cpu(summary->sum_num); i++) {
 		dbg_summary("processing summary index %d\n", i);
 
+		/* Make sure there's a spare ref for dirty space */
+		err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+		if (err)
+			return err;
+
 		switch (je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype)) {
 			case JFFS2_NODETYPE_INODE: {
 				struct jffs2_sum_inode_flash *spi;
@@ -312,38 +411,20 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				ino = je32_to_cpu(spi->inode);
 
-				dbg_summary("Inode at 0x%08x\n",
-							jeb->offset + je32_to_cpu(spi->offset));
-
-				raw = jffs2_alloc_raw_node_ref();
-				if (!raw) {
-					JFFS2_NOTICE("allocation of node reference failed\n");
-					kfree(summary);
-					return -ENOMEM;
-				}
+				dbg_summary("Inode at 0x%08x-0x%08x\n",
+					    jeb->offset + je32_to_cpu(spi->offset),
+					    jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spi->totlen));
 
 				ic = jffs2_scan_make_ino_cache(c, ino);
 				if (!ic) {
 					JFFS2_NOTICE("scan_make_ino_cache failed\n");
-					jffs2_free_raw_node_ref(raw);
-					kfree(summary);
 					return -ENOMEM;
 				}
 
-				raw->flash_offset = (jeb->offset + je32_to_cpu(spi->offset)) | REF_UNCHECKED;
-				raw->__totlen = PAD(je32_to_cpu(spi->totlen));
-				raw->next_phys = NULL;
-				raw->next_in_ino = ic->nodes;
-
-				ic->nodes = raw;
-				if (!jeb->first_node)
-					jeb->first_node = raw;
-				if (jeb->last_node)
-					jeb->last_node->next_phys = raw;
-				jeb->last_node = raw;
-				*pseudo_random += je32_to_cpu(spi->version);
+				sum_link_node_ref(c, jeb, je32_to_cpu(spi->offset) | REF_UNCHECKED,
+						  PAD(je32_to_cpu(spi->totlen)), ic);
 
-				UNCHECKED_SPACE(PAD(je32_to_cpu(spi->totlen)));
+				*pseudo_random += je32_to_cpu(spi->version);
 
 				sp += JFFS2_SUMMARY_INODE_SIZE;
 
@@ -354,52 +435,33 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				struct jffs2_sum_dirent_flash *spd;
 				spd = sp;
 
-				dbg_summary("Dirent at 0x%08x\n",
-							jeb->offset + je32_to_cpu(spd->offset));
+				dbg_summary("Dirent at 0x%08x-0x%08x\n",
+					    jeb->offset + je32_to_cpu(spd->offset),
+					    jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen));
+
 
 				fd = jffs2_alloc_full_dirent(spd->nsize+1);
-				if (!fd) {
-					kfree(summary);
+				if (!fd)
 					return -ENOMEM;
-				}
 
 				memcpy(&fd->name, spd->name, spd->nsize);
 				fd->name[spd->nsize] = 0;
 
-				raw = jffs2_alloc_raw_node_ref();
-				if (!raw) {
-					jffs2_free_full_dirent(fd);
-					JFFS2_NOTICE("allocation of node reference failed\n");
-					kfree(summary);
-					return -ENOMEM;
-				}
-
 				ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(spd->pino));
 				if (!ic) {
 					jffs2_free_full_dirent(fd);
-					jffs2_free_raw_node_ref(raw);
-					kfree(summary);
 					return -ENOMEM;
 				}
 
-				raw->__totlen = PAD(je32_to_cpu(spd->totlen));
-				raw->flash_offset = (jeb->offset + je32_to_cpu(spd->offset)) | REF_PRISTINE;
-				raw->next_phys = NULL;
-				raw->next_in_ino = ic->nodes;
-				ic->nodes = raw;
-				if (!jeb->first_node)
-					jeb->first_node = raw;
-				if (jeb->last_node)
-					jeb->last_node->next_phys = raw;
-				jeb->last_node = raw;
-
-				fd->raw = raw;
+				fd->raw = sum_link_node_ref(c, jeb,  je32_to_cpu(spd->offset) | REF_UNCHECKED,
+							    PAD(je32_to_cpu(spd->totlen)), ic);
+
 				fd->next = NULL;
 				fd->version = je32_to_cpu(spd->version);
 				fd->ino = je32_to_cpu(spd->ino);
 				fd->nhash = full_name_hash(fd->name, spd->nsize);
 				fd->type = spd->type;
-				USED_SPACE(PAD(je32_to_cpu(spd->totlen)));
+
 				jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
 
 				*pseudo_random += je32_to_cpu(spd->version);
@@ -408,48 +470,105 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				break;
 			}
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case JFFS2_NODETYPE_XATTR: {
+				struct jffs2_xattr_datum *xd;
+				struct jffs2_sum_xattr_flash *spx;
+
+				spx = (struct jffs2_sum_xattr_flash *)sp;
+				dbg_summary("xattr at %#08x-%#08x (xid=%u, version=%u)\n", 
+					    jeb->offset + je32_to_cpu(spx->offset),
+					    jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen),
+					    je32_to_cpu(spx->xid), je32_to_cpu(spx->version));
+
+				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
+								je32_to_cpu(spx->version));
+				if (IS_ERR(xd)) {
+					if (PTR_ERR(xd) == -EEXIST) {
+						/* a newer version of xd exists */
+						if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(spx->totlen))))
+							return err;
+						sp += JFFS2_SUMMARY_XATTR_SIZE;
+						break;
+					}
+					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+					return PTR_ERR(xd);
+				}
+
+				xd->node = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+							     PAD(je32_to_cpu(spx->totlen)), NULL);
+				/* FIXME */ xd->node->next_in_ino = (void *)xd;
+
+				*pseudo_random += je32_to_cpu(spx->xid);
+				sp += JFFS2_SUMMARY_XATTR_SIZE;
+
+				break;
+			}
+			case JFFS2_NODETYPE_XREF: {
+				struct jffs2_xattr_ref *ref;
+				struct jffs2_sum_xref_flash *spr;
+
+				spr = (struct jffs2_sum_xref_flash *)sp;
+				dbg_summary("xref at %#08x-%#08x\n",
+					    jeb->offset + je32_to_cpu(spr->offset),
+					    jeb->offset + je32_to_cpu(spr->offset) + 
+					    (uint32_t)PAD(sizeof(struct jffs2_raw_xref)));
+
+				ref = jffs2_alloc_xattr_ref();
+				if (!ref) {
+					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+					return -ENOMEM;
+				}
+				ref->ino = 0xfffffffe;
+				ref->xid = 0xfffffffd;
+				ref->next = c->xref_temp;
+				c->xref_temp = ref;
 
+				ref->node = sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
+							      PAD(sizeof(struct jffs2_raw_xref)), NULL);
+				/* FIXME */ ref->node->next_in_ino = (void *)ref;
+
+				*pseudo_random += ref->node->flash_offset;
+				sp += JFFS2_SUMMARY_XREF_SIZE;
+
+				break;
+			}
+#endif
 			default : {
-				JFFS2_WARNING("Unsupported node type found in summary! Exiting...");
-				kfree(summary);
-				return -EIO;
+				uint16_t nodetype = je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype);
+				JFFS2_WARNING("Unsupported node type %x found in summary! Exiting...\n", nodetype);
+				if ((nodetype & JFFS2_COMPAT_MASK) == JFFS2_FEATURE_INCOMPAT)
+					return -EIO;
+
+				/* For compatible node types, just fall back to the full scan */
+				c->wasted_size -= jeb->wasted_size;
+				c->free_size += c->sector_size - jeb->free_size;
+				c->used_size -= jeb->used_size;
+				c->dirty_size -= jeb->dirty_size;
+				jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0;
+				jeb->free_size = c->sector_size;
+
+				jffs2_free_jeb_node_refs(c, jeb);
+				return -ENOTRECOVERABLE;
 			}
 		}
 	}
-
-	kfree(summary);
 	return 0;
 }
 
 /* Process the summary node - called from jffs2_scan_eraseblock() */
-
 int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-				uint32_t ofs, uint32_t *pseudo_random)
+			   struct jffs2_raw_summary *summary, uint32_t sumsize,
+			   uint32_t *pseudo_random)
 {
 	struct jffs2_unknown_node crcnode;
-	struct jffs2_raw_node_ref *cache_ref;
-	struct jffs2_raw_summary *summary;
-	int ret, sumsize;
+	int ret, ofs;
 	uint32_t crc;
 
-	sumsize = c->sector_size - ofs;
-	ofs += jeb->offset;
+	ofs = c->sector_size - sumsize;
 
 	dbg_summary("summary found for 0x%08x at 0x%08x (0x%x bytes)\n",
-				jeb->offset, ofs, sumsize);
-
-	summary = kmalloc(sumsize, GFP_KERNEL);
-
-	if (!summary) {
-		return -ENOMEM;
-	}
-
-	ret = jffs2_fill_scan_buf(c, (unsigned char *)summary, ofs, sumsize);
-
-	if (ret) {
-		kfree(summary);
-		return ret;
-	}
+		    jeb->offset, jeb->offset + ofs, sumsize);
 
 	/* OK, now check for node validity and CRC */
 	crcnode.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -486,66 +605,49 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 
 		dbg_summary("Summary : CLEANMARKER node \n");
 
+		ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+		if (ret)
+			return ret;
+
 		if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) {
 			dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n",
 				je32_to_cpu(summary->cln_mkr), c->cleanmarker_size);
-			UNCHECKED_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
+			if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+				return ret;
 		} else if (jeb->first_node) {
 			dbg_summary("CLEANMARKER node not first node in block "
 					"(0x%08x)\n", jeb->offset);
-			UNCHECKED_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
+			if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+				return ret;
 		} else {
-			struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
-
-			if (!marker_ref) {
-				JFFS2_NOTICE("Failed to allocate node ref for clean marker\n");
-				kfree(summary);
-				return -ENOMEM;
-			}
-
-			marker_ref->next_in_ino = NULL;
-			marker_ref->next_phys = NULL;
-			marker_ref->flash_offset = jeb->offset | REF_NORMAL;
-			marker_ref->__totlen = je32_to_cpu(summary->cln_mkr);
-			jeb->first_node = jeb->last_node = marker_ref;
-
-			USED_SPACE( PAD(je32_to_cpu(summary->cln_mkr)) );
+			jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL,
+					    je32_to_cpu(summary->cln_mkr), NULL);
 		}
 	}
 
-	if (je32_to_cpu(summary->padded)) {
-		DIRTY_SPACE(je32_to_cpu(summary->padded));
-	}
-
 	ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random);
+	/* -ENOTRECOVERABLE isn't a fatal error -- it means we should do a full
+	   scan of this eraseblock. So return zero */
+	if (ret == -ENOTRECOVERABLE)
+		return 0;
 	if (ret)
-		return ret;
+		return ret;		/* real error */
 
 	/* for PARANOIA_CHECK */
-	cache_ref = jffs2_alloc_raw_node_ref();
-
-	if (!cache_ref) {
-		JFFS2_NOTICE("Failed to allocate node ref for cache\n");
-		return -ENOMEM;
-	}
-
-	cache_ref->next_in_ino = NULL;
-	cache_ref->next_phys = NULL;
-	cache_ref->flash_offset = ofs | REF_NORMAL;
-	cache_ref->__totlen = sumsize;
-
-	if (!jeb->first_node)
-		jeb->first_node = cache_ref;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = cache_ref;
-	jeb->last_node = cache_ref;
+	ret = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+	if (ret)
+		return ret;
 
-	USED_SPACE(sumsize);
+	sum_link_node_ref(c, jeb, ofs | REF_NORMAL, sumsize, NULL);
 
-	jeb->wasted_size += jeb->free_size;
-	c->wasted_size += jeb->free_size;
-	c->free_size -= jeb->free_size;
-	jeb->free_size = 0;
+	if (unlikely(jeb->free_size)) {
+		JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n",
+			      jeb->free_size, jeb->offset);
+		jeb->wasted_size += jeb->free_size;
+		c->wasted_size += jeb->free_size;
+		c->free_size -= jeb->free_size;
+		jeb->free_size = 0;
+	}
 
 	return jffs2_scan_classify_jeb(c, jeb);
 
@@ -564,6 +666,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	union jffs2_sum_mem *temp;
 	struct jffs2_sum_marker *sm;
 	struct kvec vecs[2];
+	uint32_t sum_ofs;
 	void *wpage;
 	int ret;
 	size_t retlen;
@@ -581,16 +684,17 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	wpage = c->summary->sum_buf;
 
 	while (c->summary->sum_num) {
+		temp = c->summary->sum_list_head;
 
-		switch (je16_to_cpu(c->summary->sum_list_head->u.nodetype)) {
+		switch (je16_to_cpu(temp->u.nodetype)) {
 			case JFFS2_NODETYPE_INODE: {
 				struct jffs2_sum_inode_flash *sino_ptr = wpage;
 
-				sino_ptr->nodetype = c->summary->sum_list_head->i.nodetype;
-				sino_ptr->inode = c->summary->sum_list_head->i.inode;
-				sino_ptr->version = c->summary->sum_list_head->i.version;
-				sino_ptr->offset = c->summary->sum_list_head->i.offset;
-				sino_ptr->totlen = c->summary->sum_list_head->i.totlen;
+				sino_ptr->nodetype = temp->i.nodetype;
+				sino_ptr->inode = temp->i.inode;
+				sino_ptr->version = temp->i.version;
+				sino_ptr->offset = temp->i.offset;
+				sino_ptr->totlen = temp->i.totlen;
 
 				wpage += JFFS2_SUMMARY_INODE_SIZE;
 
@@ -600,30 +704,60 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 			case JFFS2_NODETYPE_DIRENT: {
 				struct jffs2_sum_dirent_flash *sdrnt_ptr = wpage;
 
-				sdrnt_ptr->nodetype = c->summary->sum_list_head->d.nodetype;
-				sdrnt_ptr->totlen = c->summary->sum_list_head->d.totlen;
-				sdrnt_ptr->offset = c->summary->sum_list_head->d.offset;
-				sdrnt_ptr->pino = c->summary->sum_list_head->d.pino;
-				sdrnt_ptr->version = c->summary->sum_list_head->d.version;
-				sdrnt_ptr->ino = c->summary->sum_list_head->d.ino;
-				sdrnt_ptr->nsize = c->summary->sum_list_head->d.nsize;
-				sdrnt_ptr->type = c->summary->sum_list_head->d.type;
+				sdrnt_ptr->nodetype = temp->d.nodetype;
+				sdrnt_ptr->totlen = temp->d.totlen;
+				sdrnt_ptr->offset = temp->d.offset;
+				sdrnt_ptr->pino = temp->d.pino;
+				sdrnt_ptr->version = temp->d.version;
+				sdrnt_ptr->ino = temp->d.ino;
+				sdrnt_ptr->nsize = temp->d.nsize;
+				sdrnt_ptr->type = temp->d.type;
 
-				memcpy(sdrnt_ptr->name, c->summary->sum_list_head->d.name,
-							c->summary->sum_list_head->d.nsize);
+				memcpy(sdrnt_ptr->name, temp->d.name,
+							temp->d.nsize);
 
-				wpage += JFFS2_SUMMARY_DIRENT_SIZE(c->summary->sum_list_head->d.nsize);
+				wpage += JFFS2_SUMMARY_DIRENT_SIZE(temp->d.nsize);
 
 				break;
 			}
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case JFFS2_NODETYPE_XATTR: {
+				struct jffs2_sum_xattr_flash *sxattr_ptr = wpage;
+
+				temp = c->summary->sum_list_head;
+				sxattr_ptr->nodetype = temp->x.nodetype;
+				sxattr_ptr->xid = temp->x.xid;
+				sxattr_ptr->version = temp->x.version;
+				sxattr_ptr->offset = temp->x.offset;
+				sxattr_ptr->totlen = temp->x.totlen;
+
+				wpage += JFFS2_SUMMARY_XATTR_SIZE;
+				break;
+			}
+			case JFFS2_NODETYPE_XREF: {
+				struct jffs2_sum_xref_flash *sxref_ptr = wpage;
 
+				temp = c->summary->sum_list_head;
+				sxref_ptr->nodetype = temp->r.nodetype;
+				sxref_ptr->offset = temp->r.offset;
+
+				wpage += JFFS2_SUMMARY_XREF_SIZE;
+				break;
+			}
+#endif
 			default : {
-				BUG();	/* unknown node in summary information */
+				if ((je16_to_cpu(temp->u.nodetype) & JFFS2_COMPAT_MASK)
+				    == JFFS2_FEATURE_RWCOMPAT_COPY) {
+					dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n",
+						    je16_to_cpu(temp->u.nodetype));
+					jffs2_sum_disable_collecting(c->summary);
+				} else {
+					BUG();	/* unknown node in summary information */
+				}
 			}
 		}
 
-		temp = c->summary->sum_list_head;
-		c->summary->sum_list_head = c->summary->sum_list_head->u.next;
+		c->summary->sum_list_head = temp->u.next;
 		kfree(temp);
 
 		c->summary->sum_num--;
@@ -645,25 +779,34 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	vecs[1].iov_base = c->summary->sum_buf;
 	vecs[1].iov_len = datasize;
 
-	dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n",
-			jeb->offset + c->sector_size - jeb->free_size);
+	sum_ofs = jeb->offset + c->sector_size - jeb->free_size;
 
-	spin_unlock(&c->erase_completion_lock);
-	ret = jffs2_flash_writev(c, vecs, 2, jeb->offset + c->sector_size -
-				jeb->free_size, &retlen, 0);
-	spin_lock(&c->erase_completion_lock);
+	dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n",
+		    sum_ofs);
 
+	ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0);
 
 	if (ret || (retlen != infosize)) {
-		JFFS2_WARNING("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
-			infosize, jeb->offset + c->sector_size - jeb->free_size, ret, retlen);
+
+		JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n",
+			      infosize, sum_ofs, ret, retlen);
+
+		if (retlen) {
+			/* Waste remaining space */
+			spin_lock(&c->erase_completion_lock);
+			jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL);
+			spin_unlock(&c->erase_completion_lock);
+		}
 
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
-		WASTED_SPACE(infosize);
 
-		return 1;
+		return 0;
 	}
 
+	spin_lock(&c->erase_completion_lock);
+	jffs2_link_node_ref(c, jeb, sum_ofs | REF_NORMAL, infosize, NULL);
+	spin_unlock(&c->erase_completion_lock);
+
 	return 0;
 }
 
@@ -671,13 +814,16 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 {
-	struct jffs2_raw_node_ref *summary_ref;
-	int datasize, infosize, padsize, ret;
+	int datasize, infosize, padsize;
 	struct jffs2_eraseblock *jeb;
+	int ret;
 
 	dbg_summary("called\n");
 
+	spin_unlock(&c->erase_completion_lock);
+
 	jeb = c->nextblock;
+	jffs2_prealloc_raw_node_refs(c, jeb, 1);
 
 	if (!c->summary->sum_num || !c->summary->sum_list_head) {
 		JFFS2_WARNING("Empty summary info!!!\n");
@@ -696,35 +842,11 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 		jffs2_sum_disable_collecting(c->summary);
 
 		JFFS2_WARNING("Not enough space for summary, padsize = %d\n", padsize);
+		spin_lock(&c->erase_completion_lock);
 		return 0;
 	}
 
 	ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
-	if (ret)
-		return 0; /* can't write out summary, block is marked as NOSUM_SIZE */
-
-	/* for ACCT_PARANOIA_CHECK */
-	spin_unlock(&c->erase_completion_lock);
-	summary_ref = jffs2_alloc_raw_node_ref();
 	spin_lock(&c->erase_completion_lock);
-
-	if (!summary_ref) {
-		JFFS2_NOTICE("Failed to allocate node ref for summary\n");
-		return -ENOMEM;
-	}
-
-	summary_ref->next_in_ino = NULL;
-	summary_ref->next_phys = NULL;
-	summary_ref->flash_offset = (jeb->offset + c->sector_size - jeb->free_size) | REF_NORMAL;
-	summary_ref->__totlen = infosize;
-
-	if (!jeb->first_node)
-		jeb->first_node = summary_ref;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = summary_ref;
-	jeb->last_node = summary_ref;
-
-	USED_SPACE(infosize);
-
-	return 0;
+	return ret;
 }
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index b7a678be1709..6bf1f6aa4552 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -18,23 +18,6 @@
 #include <linux/uio.h>
 #include <linux/jffs2.h>
 
-#define DIRTY_SPACE(x) do { typeof(x) _x = (x); \
-		c->free_size -= _x; c->dirty_size += _x; \
-		jeb->free_size -= _x ; jeb->dirty_size += _x; \
-		}while(0)
-#define USED_SPACE(x) do { typeof(x) _x = (x); \
-		c->free_size -= _x; c->used_size += _x; \
-		jeb->free_size -= _x ; jeb->used_size += _x; \
-		}while(0)
-#define WASTED_SPACE(x) do { typeof(x) _x = (x); \
-		c->free_size -= _x; c->wasted_size += _x; \
-		jeb->free_size -= _x ; jeb->wasted_size += _x; \
-		}while(0)
-#define UNCHECKED_SPACE(x) do { typeof(x) _x = (x); \
-		c->free_size -= _x; c->unchecked_size += _x; \
-		jeb->free_size -= _x ; jeb->unchecked_size += _x; \
-		}while(0)
-
 #define BLK_STATE_ALLFF		0
 #define BLK_STATE_CLEAN		1
 #define BLK_STATE_PARTDIRTY	2
@@ -45,6 +28,8 @@
 #define JFFS2_SUMMARY_NOSUM_SIZE 0xffffffff
 #define JFFS2_SUMMARY_INODE_SIZE (sizeof(struct jffs2_sum_inode_flash))
 #define JFFS2_SUMMARY_DIRENT_SIZE(x) (sizeof(struct jffs2_sum_dirent_flash) + (x))
+#define JFFS2_SUMMARY_XATTR_SIZE (sizeof(struct jffs2_sum_xattr_flash))
+#define JFFS2_SUMMARY_XREF_SIZE (sizeof(struct jffs2_sum_xref_flash))
 
 /* Summary structures used on flash */
 
@@ -75,11 +60,28 @@ struct jffs2_sum_dirent_flash
 	uint8_t name[0];	/* dirent name */
 } __attribute__((packed));
 
+struct jffs2_sum_xattr_flash
+{
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XATTR */
+	jint32_t xid;		/* xattr identifier */
+	jint32_t version;	/* version number */
+	jint32_t offset;	/* offset on jeb */
+	jint32_t totlen;	/* node length */
+} __attribute__((packed));
+
+struct jffs2_sum_xref_flash
+{
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XREF */
+	jint32_t offset;	/* offset on jeb */
+} __attribute__((packed));
+
 union jffs2_sum_flash
 {
 	struct jffs2_sum_unknown_flash u;
 	struct jffs2_sum_inode_flash i;
 	struct jffs2_sum_dirent_flash d;
+	struct jffs2_sum_xattr_flash x;	/* JFFS2_NODETYPE_XATTR entries */
+	struct jffs2_sum_xref_flash r;	/* JFFS2_NODETYPE_XREF entries */
 };
 
 /* Summary structures used in the memory */
@@ -114,11 +116,30 @@ struct jffs2_sum_dirent_mem
 	uint8_t name[0];	/* dirent name */
 } __attribute__((packed));
 
+struct jffs2_sum_xattr_mem
+{
+	union jffs2_sum_mem *next;	/* next collected summary entry */
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XATTR */
+	jint32_t xid;		/* xattr identifier */
+	jint32_t version;	/* version number */
+	jint32_t offset;	/* offset on jeb */
+	jint32_t totlen;	/* node length */
+} __attribute__((packed));
+
+struct jffs2_sum_xref_mem
+{
+	union jffs2_sum_mem *next;	/* next collected summary entry */
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XREF */
+	jint32_t offset;	/* offset on jeb */
+} __attribute__((packed));
+
 union jffs2_sum_mem
 {
 	struct jffs2_sum_unknown_mem u;
 	struct jffs2_sum_inode_mem i;
 	struct jffs2_sum_dirent_mem d;
+	struct jffs2_sum_xattr_mem x;	/* JFFS2_NODETYPE_XATTR entries */
+	struct jffs2_sum_xref_mem r;	/* JFFS2_NODETYPE_XREF entries */
 };
 
 /* Summary related information stored in superblock */
@@ -159,8 +180,11 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c);
 int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size);
 int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs);
 int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs);
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs);
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs);
 int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-			uint32_t ofs, uint32_t *pseudo_random);
+			   struct jffs2_raw_summary *summary, uint32_t sumlen,
+			   uint32_t *pseudo_random);
 
 #else				/* SUMMARY DISABLED */
 
@@ -176,7 +200,9 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 #define jffs2_sum_add_padding_mem(a,b)
 #define jffs2_sum_add_inode_mem(a,b,c)
 #define jffs2_sum_add_dirent_mem(a,b,c)
-#define jffs2_sum_scan_sumnode(a,b,c,d) (0)
+#define jffs2_sum_add_xattr_mem(a,b,c)
+#define jffs2_sum_add_xref_mem(a,b,c)
+#define jffs2_sum_scan_sumnode(a,b,c,d,e) (0)
 
 #endif /* CONFIG_JFFS2_SUMMARY */
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index ffd8e84b22cc..2378a662c256 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -111,9 +111,10 @@ static int jffs2_sb_set(struct super_block *sb, void *data)
 	return 0;
 }
 
-static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data, struct mtd_info *mtd)
+static int jffs2_get_sb_mtd(struct file_system_type *fs_type,
+			    int flags, const char *dev_name,
+			    void *data, struct mtd_info *mtd,
+			    struct vfsmount *mnt)
 {
 	struct super_block *sb;
 	struct jffs2_sb_info *c;
@@ -121,19 +122,20 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 
 	c = kmalloc(sizeof(*c), GFP_KERNEL);
 	if (!c)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	memset(c, 0, sizeof(*c));
 	c->mtd = mtd;
 
 	sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c);
 
 	if (IS_ERR(sb))
-		goto out_put;
+		goto out_error;
 
 	if (sb->s_root) {
 		/* New mountpoint for JFFS2 which is already mounted */
 		D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): Device %d (\"%s\") is already mounted\n",
 			  mtd->index, mtd->name));
+		ret = simple_set_mnt(mnt, sb);
 		goto out_put;
 	}
 
@@ -151,51 +153,57 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 
 	sb->s_op = &jffs2_super_operations;
 	sb->s_flags = flags | MS_NOATIME;
-
+	sb->s_xattr = jffs2_xattr_handlers;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	sb->s_flags |= MS_POSIXACL;
+#endif
 	ret = jffs2_do_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 
 	if (ret) {
 		/* Failure case... */
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	sb->s_flags |= MS_ACTIVE;
-	return sb;
+	return simple_set_mnt(mnt, sb);
 
+out_error:
+	ret = PTR_ERR(sb);
  out_put:
 	kfree(c);
 	put_mtd_device(mtd);
 
-	return sb;
+	return ret;
 }
 
-static struct super_block *jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data, int mtdnr)
+static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
+			      int flags, const char *dev_name,
+			      void *data, int mtdnr,
+			      struct vfsmount *mnt)
 {
 	struct mtd_info *mtd;
 
 	mtd = get_mtd_device(NULL, mtdnr);
 	if (!mtd) {
 		D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr));
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+	return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
 }
 
-static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int jffs2_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
 	int err;
 	struct nameidata nd;
 	int mtdnr;
 
 	if (!dev_name)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	D1(printk(KERN_DEBUG "jffs2_get_sb(): dev_name \"%s\"\n", dev_name));
 
@@ -217,7 +225,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 				mtd = get_mtd_device(NULL, mtdnr);
 				if (mtd) {
 					if (!strcmp(mtd->name, dev_name+4))
-						return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+						return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
 					put_mtd_device(mtd);
 				}
 			}
@@ -230,7 +238,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 			if (!*endptr) {
 				/* It was a valid number */
 				D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd%%d, mtdnr %d\n", mtdnr));
-				return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+				return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 			}
 		}
 	}
@@ -244,7 +252,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 		  err, nd.dentry->d_inode));
 
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	err = -EINVAL;
 
@@ -266,11 +274,11 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 	mtdnr = iminor(nd.dentry->d_inode);
 	path_release(&nd);
 
-	return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+	return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 
 out:
 	path_release(&nd);
-	return ERR_PTR(err);
+	return err;
 }
 
 static void jffs2_put_super (struct super_block *sb)
@@ -293,6 +301,7 @@ static void jffs2_put_super (struct super_block *sb)
 		kfree(c->blocks);
 	jffs2_flash_cleanup(c);
 	kfree(c->inocache_list);
+	jffs2_clear_xattr_subsystem(c);
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
@@ -320,6 +329,18 @@ static int __init init_jffs2_fs(void)
 {
 	int ret;
 
+	/* Paranoia checks for on-medium structures. If we ask GCC
+	   to pack them with __attribute__((packed)) then it _also_
+	   assumes that they're not aligned -- so it emits crappy
+	   code on some architectures. Ideally we want an attribute
+	   which means just 'no padding', without the alignment
+	   thing. But GCC doesn't have that -- we have to just
+	   hope the structs are the right sizes, instead. */
+	BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+	BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+	BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+	BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
+
 	printk(KERN_INFO "JFFS2 version 2.2."
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	       " (NAND)"
@@ -327,7 +348,7 @@ static int __init init_jffs2_fs(void)
 #ifdef CONFIG_JFFS2_SUMMARY
 	       " (SUMMARY) "
 #endif
-	       " (C) 2001-2003 Red Hat, Inc.\n");
+	       " (C) 2001-2006 Red Hat, Inc.\n");
 
 	jffs2_inode_cachep = kmem_cache_create("jffs2_i",
 					     sizeof(struct jffs2_inode_info),
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index d55754fe8925..fc211b6e9b03 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -24,7 +24,12 @@ struct inode_operations jffs2_symlink_inode_operations =
 {
 	.readlink =	generic_readlink,
 	.follow_link =	jffs2_follow_link,
-	.setattr =	jffs2_setattr
+	.permission =	jffs2_permission,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4cebf0e57c46..a7f153f79ecb 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -156,69 +156,130 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
 		jffs2_erase_pending_trigger(c);
 	}
 
-	/* Adjust its size counts accordingly */
-	c->wasted_size += jeb->free_size;
-	c->free_size -= jeb->free_size;
-	jeb->wasted_size += jeb->free_size;
-	jeb->free_size = 0;
+	if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
+		uint32_t oldfree = jeb->free_size;
+
+		jffs2_link_node_ref(c, jeb, 
+				    (jeb->offset+c->sector_size-oldfree) | REF_OBSOLETE,
+				    oldfree, NULL);
+		/* convert to wasted */
+		c->wasted_size += oldfree;
+		jeb->wasted_size += oldfree;
+		c->dirty_size -= oldfree;
+		jeb->dirty_size -= oldfree;
+	}
 
 	jffs2_dbg_dump_block_lists_nolock(c);
 	jffs2_dbg_acct_sanity_check_nolock(c,jeb);
 	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
 }
 
+static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info *c,
+							    struct jffs2_inode_info *f,
+							    struct jffs2_raw_node_ref *raw,
+							    union jffs2_node_union *node)
+{
+	struct jffs2_node_frag *frag;
+	struct jffs2_full_dirent *fd;
+
+	dbg_noderef("incore_replace_raw: node at %p is {%04x,%04x}\n",
+		    node, je16_to_cpu(node->u.magic), je16_to_cpu(node->u.nodetype));
+
+	BUG_ON(je16_to_cpu(node->u.magic) != 0x1985 &&
+	       je16_to_cpu(node->u.magic) != 0);
+
+	switch (je16_to_cpu(node->u.nodetype)) {
+	case JFFS2_NODETYPE_INODE:
+		if (f->metadata && f->metadata->raw == raw) {
+			dbg_noderef("Will replace ->raw in f->metadata at %p\n", f->metadata);
+			return &f->metadata->raw;
+		}
+		frag = jffs2_lookup_node_frag(&f->fragtree, je32_to_cpu(node->i.offset));
+		BUG_ON(!frag);
+		/* Find a frag which refers to the full_dnode we want to modify */
+		while (!frag->node || frag->node->raw != raw) {
+			frag = frag_next(frag);
+			BUG_ON(!frag);
+		}
+		dbg_noderef("Will replace ->raw in full_dnode at %p\n", frag->node);
+		return &frag->node->raw;
+
+	case JFFS2_NODETYPE_DIRENT:
+		for (fd = f->dents; fd; fd = fd->next) {
+			if (fd->raw == raw) {
+				dbg_noderef("Will replace ->raw in full_dirent at %p\n", fd);
+				return &fd->raw;
+			}
+		}
+		BUG();
+
+	default:
+		dbg_noderef("Don't care about replacing raw for nodetype %x\n",
+			    je16_to_cpu(node->u.nodetype));
+		break;
+	}
+	return NULL;
+}
+
 /* Recover from failure to write wbuf. Recover the nodes up to the
  * wbuf, not the one which we were starting to try to write. */
 
 static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 {
 	struct jffs2_eraseblock *jeb, *new_jeb;
-	struct jffs2_raw_node_ref **first_raw, **raw;
+	struct jffs2_raw_node_ref *raw, *next, *first_raw = NULL;
 	size_t retlen;
 	int ret;
+	int nr_refile = 0;
 	unsigned char *buf;
 	uint32_t start, end, ofs, len;
 
-	spin_lock(&c->erase_completion_lock);
-
 	jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
 
+	spin_lock(&c->erase_completion_lock);
 	jffs2_block_refile(c, jeb, REFILE_NOTEMPTY);
+	spin_unlock(&c->erase_completion_lock);
+
+	BUG_ON(!ref_obsolete(jeb->last_node));
 
 	/* Find the first node to be recovered, by skipping over every
 	   node which ends before the wbuf starts, or which is obsolete. */
-	first_raw = &jeb->first_node;
-	while (*first_raw &&
-	       (ref_obsolete(*first_raw) ||
-		(ref_offset(*first_raw)+ref_totlen(c, jeb, *first_raw)) < c->wbuf_ofs)) {
-		D1(printk(KERN_DEBUG "Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n",
-			  ref_offset(*first_raw), ref_flags(*first_raw),
-			  (ref_offset(*first_raw) + ref_totlen(c, jeb, *first_raw)),
-			  c->wbuf_ofs));
-		first_raw = &(*first_raw)->next_phys;
+	for (next = raw = jeb->first_node; next; raw = next) {
+		next = ref_next(raw);
+
+		if (ref_obsolete(raw) || 
+		    (next && ref_offset(next) <= c->wbuf_ofs)) {
+			dbg_noderef("Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n",
+				    ref_offset(raw), ref_flags(raw),
+				    (ref_offset(raw) + ref_totlen(c, jeb, raw)),
+				    c->wbuf_ofs);
+			continue;
+		}
+		dbg_noderef("First node to be recovered is at 0x%08x(%d)-0x%08x\n",
+			    ref_offset(raw), ref_flags(raw),
+			    (ref_offset(raw) + ref_totlen(c, jeb, raw)));
+
+		first_raw = raw;
+		break;
 	}
 
-	if (!*first_raw) {
+	if (!first_raw) {
 		/* All nodes were obsolete. Nothing to recover. */
 		D1(printk(KERN_DEBUG "No non-obsolete nodes to be recovered. Just filing block bad\n"));
-		spin_unlock(&c->erase_completion_lock);
+		c->wbuf_len = 0;
 		return;
 	}
 
-	start = ref_offset(*first_raw);
-	end = ref_offset(*first_raw) + ref_totlen(c, jeb, *first_raw);
-
-	/* Find the last node to be recovered */
-	raw = first_raw;
-	while ((*raw)) {
-		if (!ref_obsolete(*raw))
-			end = ref_offset(*raw) + ref_totlen(c, jeb, *raw);
+	start = ref_offset(first_raw);
+	end = ref_offset(jeb->last_node);
+	nr_refile = 1;
 
-		raw = &(*raw)->next_phys;
-	}
-	spin_unlock(&c->erase_completion_lock);
+	/* Count the number of refs which need to be copied */
+	while ((raw = ref_next(raw)) != jeb->last_node)
+		nr_refile++;
 
-	D1(printk(KERN_DEBUG "wbuf recover %08x-%08x\n", start, end));
+	dbg_noderef("wbuf recover %08x-%08x (%d bytes in %d nodes)\n",
+		    start, end, end - start, nr_refile);
 
 	buf = NULL;
 	if (start < c->wbuf_ofs) {
@@ -233,28 +294,37 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		}
 
 		/* Do the read... */
-		if (jffs2_cleanmarker_oob(c))
-			ret = c->mtd->read_ecc(c->mtd, start, c->wbuf_ofs - start, &retlen, buf, NULL, c->oobinfo);
-		else
-			ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
+		ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
 
-		if (ret == -EBADMSG && retlen == c->wbuf_ofs - start) {
-			/* ECC recovered */
+		/* ECC recovered ? */
+		if ((ret == -EUCLEAN || ret == -EBADMSG) &&
+		    (retlen == c->wbuf_ofs - start))
 			ret = 0;
-		}
+
 		if (ret || retlen != c->wbuf_ofs - start) {
 			printk(KERN_CRIT "Old data are already lost in wbuf recovery. Data loss ensues.\n");
 
 			kfree(buf);
 			buf = NULL;
 		read_failed:
-			first_raw = &(*first_raw)->next_phys;
+			first_raw = ref_next(first_raw);
+			nr_refile--;
+			while (first_raw && ref_obsolete(first_raw)) {
+				first_raw = ref_next(first_raw);
+				nr_refile--;
+			}
+
 			/* If this was the only node to be recovered, give up */
-			if (!(*first_raw))
+			if (!first_raw) {
+				c->wbuf_len = 0;
 				return;
+			}
 
 			/* It wasn't. Go on and try to recover nodes complete in the wbuf */
-			start = ref_offset(*first_raw);
+			start = ref_offset(first_raw);
+			dbg_noderef("wbuf now recover %08x-%08x (%d bytes in %d nodes)\n",
+				    start, end, end - start, nr_refile);
+
 		} else {
 			/* Read succeeded. Copy the remaining data from the wbuf */
 			memcpy(buf + (c->wbuf_ofs - start), c->wbuf, end - c->wbuf_ofs);
@@ -263,14 +333,23 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 	/* OK... we're to rewrite (end-start) bytes of data from first_raw onwards.
 	   Either 'buf' contains the data, or we find it in the wbuf */
 
-
 	/* ... and get an allocation of space from a shiny new block instead */
-	ret = jffs2_reserve_space_gc(c, end-start, &ofs, &len, JFFS2_SUMMARY_NOSUM_SIZE);
+	ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE);
 	if (ret) {
 		printk(KERN_WARNING "Failed to allocate space for wbuf recovery. Data loss ensues.\n");
 		kfree(buf);
 		return;
 	}
+
+	ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile);
+	if (ret) {
+		printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
+		kfree(buf);
+		return;
+	}
+
+	ofs = write_ofs(c);
+
 	if (end-start >= c->wbuf_pagesize) {
 		/* Need to do another write immediately, but it's possible
 		   that this is just because the wbuf itself is completely
@@ -288,36 +367,22 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		if (breakme++ == 20) {
 			printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs);
 			breakme = 0;
-			c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen,
-					  brokenbuf, NULL, c->oobinfo);
+			c->mtd->write(c->mtd, ofs, towrite, &retlen,
+				      brokenbuf);
 			ret = -EIO;
 		} else
 #endif
-		if (jffs2_cleanmarker_oob(c))
-			ret = c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen,
-						rewrite_buf, NULL, c->oobinfo);
-		else
-			ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, rewrite_buf);
+			ret = c->mtd->write(c->mtd, ofs, towrite, &retlen,
+					    rewrite_buf);
 
 		if (ret || retlen != towrite) {
 			/* Argh. We tried. Really we did. */
 			printk(KERN_CRIT "Recovery of wbuf failed due to a second write error\n");
 			kfree(buf);
 
-			if (retlen) {
-				struct jffs2_raw_node_ref *raw2;
-
-				raw2 = jffs2_alloc_raw_node_ref();
-				if (!raw2)
-					return;
+			if (retlen)
+				jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, first_raw), NULL);
 
-				raw2->flash_offset = ofs | REF_OBSOLETE;
-				raw2->__totlen = ref_totlen(c, jeb, *first_raw);
-				raw2->next_phys = NULL;
-				raw2->next_in_ino = NULL;
-
-				jffs2_add_physical_node_ref(c, raw2);
-			}
 			return;
 		}
 		printk(KERN_NOTICE "Recovery of wbuf succeeded to %08x\n", ofs);
@@ -326,12 +391,10 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		c->wbuf_ofs = ofs + towrite;
 		memmove(c->wbuf, rewrite_buf + towrite, c->wbuf_len);
 		/* Don't muck about with c->wbuf_inodes. False positives are harmless. */
-		kfree(buf);
 	} else {
 		/* OK, now we're left with the dregs in whichever buffer we're using */
 		if (buf) {
 			memcpy(c->wbuf, buf, end-start);
-			kfree(buf);
 		} else {
 			memmove(c->wbuf, c->wbuf + (start - c->wbuf_ofs), end - start);
 		}
@@ -343,62 +406,111 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 	new_jeb = &c->blocks[ofs / c->sector_size];
 
 	spin_lock(&c->erase_completion_lock);
-	if (new_jeb->first_node) {
-		/* Odd, but possible with ST flash later maybe */
-		new_jeb->last_node->next_phys = *first_raw;
-	} else {
-		new_jeb->first_node = *first_raw;
-	}
-
-	raw = first_raw;
-	while (*raw) {
-		uint32_t rawlen = ref_totlen(c, jeb, *raw);
+	for (raw = first_raw; raw != jeb->last_node; raw = ref_next(raw)) {
+		uint32_t rawlen = ref_totlen(c, jeb, raw);
+		struct jffs2_inode_cache *ic;
+		struct jffs2_raw_node_ref *new_ref;
+		struct jffs2_raw_node_ref **adjust_ref = NULL;
+		struct jffs2_inode_info *f = NULL;
 
 		D1(printk(KERN_DEBUG "Refiling block of %08x at %08x(%d) to %08x\n",
-			  rawlen, ref_offset(*raw), ref_flags(*raw), ofs));
+			  rawlen, ref_offset(raw), ref_flags(raw), ofs));
+
+		ic = jffs2_raw_ref_to_ic(raw);
+
+		/* Ick. This XATTR mess should be fixed shortly... */
+		if (ic && ic->class == RAWNODE_CLASS_XATTR_DATUM) {
+			struct jffs2_xattr_datum *xd = (void *)ic;
+			BUG_ON(xd->node != raw);
+			adjust_ref = &xd->node;
+			raw->next_in_ino = NULL;
+			ic = NULL;
+		} else if (ic && ic->class == RAWNODE_CLASS_XATTR_REF) {
+			struct jffs2_xattr_datum *xr = (void *)ic;
+			BUG_ON(xr->node != raw);
+			adjust_ref = &xr->node;
+			raw->next_in_ino = NULL;
+			ic = NULL;
+		} else if (ic && ic->class == RAWNODE_CLASS_INODE_CACHE) {
+			struct jffs2_raw_node_ref **p = &ic->nodes;
+
+			/* Remove the old node from the per-inode list */
+			while (*p && *p != (void *)ic) {
+				if (*p == raw) {
+					(*p) = (raw->next_in_ino);
+					raw->next_in_ino = NULL;
+					break;
+				}
+				p = &((*p)->next_in_ino);
+			}
 
-		if (ref_obsolete(*raw)) {
-			/* Shouldn't really happen much */
-			new_jeb->dirty_size += rawlen;
-			new_jeb->free_size -= rawlen;
-			c->dirty_size += rawlen;
-		} else {
-			new_jeb->used_size += rawlen;
-			new_jeb->free_size -= rawlen;
+			if (ic->state == INO_STATE_PRESENT && !ref_obsolete(raw)) {
+				/* If it's an in-core inode, then we have to adjust any
+				   full_dirent or full_dnode structure to point to the
+				   new version instead of the old */
+				f = jffs2_gc_fetch_inode(c, ic->ino, ic->nlink);
+				if (IS_ERR(f)) {
+					/* Should never happen; it _must_ be present */
+					JFFS2_ERROR("Failed to iget() ino #%u, err %ld\n",
+						    ic->ino, PTR_ERR(f));
+					BUG();
+				}
+				/* We don't lock f->sem. There are a number of ways we could
+				   end up in here with it already being locked, and nobody's
+				   going to modify it on us anyway because we hold the
+				   alloc_sem. We're only changing one ->raw pointer too,
+				   which we can get away with without upsetting readers. */
+				adjust_ref = jffs2_incore_replace_raw(c, f, raw,
+								      (void *)(buf?:c->wbuf) + (ref_offset(raw) - start));
+			} else if (unlikely(ic->state != INO_STATE_PRESENT &&
+					    ic->state != INO_STATE_CHECKEDABSENT &&
+					    ic->state != INO_STATE_GC)) {
+				JFFS2_ERROR("Inode #%u is in strange state %d!\n", ic->ino, ic->state);
+				BUG();
+			}
+		}
+
+		new_ref = jffs2_link_node_ref(c, new_jeb, ofs | ref_flags(raw), rawlen, ic);
+
+		if (adjust_ref) {
+			BUG_ON(*adjust_ref != raw);
+			*adjust_ref = new_ref;
+		}
+		if (f)
+			jffs2_gc_release_inode(c, f);
+
+		if (!ref_obsolete(raw)) {
 			jeb->dirty_size += rawlen;
 			jeb->used_size  -= rawlen;
 			c->dirty_size += rawlen;
+			c->used_size -= rawlen;
+			raw->flash_offset = ref_offset(raw) | REF_OBSOLETE;
+			BUG_ON(raw->next_in_ino);
 		}
-		c->free_size -= rawlen;
-		(*raw)->flash_offset = ofs | ref_flags(*raw);
 		ofs += rawlen;
-		new_jeb->last_node = *raw;
-
-		raw = &(*raw)->next_phys;
 	}
 
+	kfree(buf);
+
 	/* Fix up the original jeb now it's on the bad_list */
-	*first_raw = NULL;
-	if (first_raw == &jeb->first_node) {
-		jeb->last_node = NULL;
+	if (first_raw == jeb->first_node) {
 		D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
 		list_del(&jeb->list);
 		list_add(&jeb->list, &c->erase_pending_list);
 		c->nr_erasing_blocks++;
 		jffs2_erase_pending_trigger(c);
 	}
-	else
-		jeb->last_node = container_of(first_raw, struct jffs2_raw_node_ref, next_phys);
 
 	jffs2_dbg_acct_sanity_check_nolock(c, jeb);
-        jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
 
 	jffs2_dbg_acct_sanity_check_nolock(c, new_jeb);
-        jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb);
 
 	spin_unlock(&c->erase_completion_lock);
 
-	D1(printk(KERN_DEBUG "wbuf recovery completed OK\n"));
+	D1(printk(KERN_DEBUG "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", c->wbuf_ofs, c->wbuf_len));
+
 }
 
 /* Meaning of pad argument:
@@ -412,6 +524,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 
 static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 {
+	struct jffs2_eraseblock *wbuf_jeb;
 	int ret;
 	size_t retlen;
 
@@ -429,6 +542,10 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	if (!c->wbuf_len)	/* already checked c->wbuf above */
 		return 0;
 
+	wbuf_jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
+	if (jffs2_prealloc_raw_node_refs(c, wbuf_jeb, c->nextblock->allocated_refs + 1))
+		return -ENOMEM;
+
 	/* claim remaining space on the page
 	   this happens, if we have a change to a new block,
 	   or if fsync forces us to flush the writebuffer.
@@ -458,15 +575,12 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	if (breakme++ == 20) {
 		printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs);
 		breakme = 0;
-		c->mtd->write_ecc(c->mtd, c->wbuf_ofs, c->wbuf_pagesize,
-					&retlen, brokenbuf, NULL, c->oobinfo);
+		c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
+			      brokenbuf);
 		ret = -EIO;
 	} else
 #endif
 
-	if (jffs2_cleanmarker_oob(c))
-		ret = c->mtd->write_ecc(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf, NULL, c->oobinfo);
-	else
 		ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf);
 
 	if (ret || retlen != c->wbuf_pagesize) {
@@ -483,32 +597,34 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 		return ret;
 	}
 
-	spin_lock(&c->erase_completion_lock);
-
 	/* Adjust free size of the block if we padded. */
 	if (pad) {
-		struct jffs2_eraseblock *jeb;
-
-		jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
+		uint32_t waste = c->wbuf_pagesize - c->wbuf_len;
 
 		D1(printk(KERN_DEBUG "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n",
-			  (jeb==c->nextblock)?"next":"", jeb->offset));
+			  (wbuf_jeb==c->nextblock)?"next":"", wbuf_jeb->offset));
 
 		/* wbuf_pagesize - wbuf_len is the amount of space that's to be
 		   padded. If there is less free space in the block than that,
 		   something screwed up */
-		if (jeb->free_size < (c->wbuf_pagesize - c->wbuf_len)) {
+		if (wbuf_jeb->free_size < waste) {
 			printk(KERN_CRIT "jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n",
-			       c->wbuf_ofs, c->wbuf_len, c->wbuf_pagesize-c->wbuf_len);
+			       c->wbuf_ofs, c->wbuf_len, waste);
 			printk(KERN_CRIT "jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n",
-			       jeb->offset, jeb->free_size);
+			       wbuf_jeb->offset, wbuf_jeb->free_size);
 			BUG();
 		}
-		jeb->free_size -= (c->wbuf_pagesize - c->wbuf_len);
-		c->free_size -= (c->wbuf_pagesize - c->wbuf_len);
-		jeb->wasted_size += (c->wbuf_pagesize - c->wbuf_len);
-		c->wasted_size += (c->wbuf_pagesize - c->wbuf_len);
-	}
+
+		spin_lock(&c->erase_completion_lock);
+
+		jffs2_link_node_ref(c, wbuf_jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL);
+		/* FIXME: that made it count as dirty. Convert to wasted */
+		wbuf_jeb->dirty_size -= waste;
+		c->dirty_size -= waste;
+		wbuf_jeb->wasted_size += waste;
+		c->wasted_size += waste;
+	} else
+		spin_lock(&c->erase_completion_lock);
 
 	/* Stick any now-obsoleted blocks on the erase_pending_list */
 	jffs2_refile_wbuf_blocks(c);
@@ -603,20 +719,30 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c)
 
 	return ret;
 }
-int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino)
+
+static size_t jffs2_fill_wbuf(struct jffs2_sb_info *c, const uint8_t *buf,
+			      size_t len)
 {
-	struct kvec outvecs[3];
-	uint32_t totlen = 0;
-	uint32_t split_ofs = 0;
-	uint32_t old_totlen;
-	int ret, splitvec = -1;
-	int invec, outvec;
-	size_t wbuf_retlen;
-	unsigned char *wbuf_ptr;
-	size_t donelen = 0;
+	if (len && !c->wbuf_len && (len >= c->wbuf_pagesize))
+		return 0;
+
+	if (len > (c->wbuf_pagesize - c->wbuf_len))
+		len = c->wbuf_pagesize - c->wbuf_len;
+	memcpy(c->wbuf + c->wbuf_len, buf, len);
+	c->wbuf_len += (uint32_t) len;
+	return len;
+}
+
+int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
+		       unsigned long count, loff_t to, size_t *retlen,
+		       uint32_t ino)
+{
+	struct jffs2_eraseblock *jeb;
+	size_t wbuf_retlen, donelen = 0;
 	uint32_t outvec_to = to;
+	int ret, invec;
 
-	/* If not NAND flash, don't bother */
+	/* If not writebuffered flash, don't bother */
 	if (!jffs2_is_writebuffered(c))
 		return jffs2_flash_direct_writev(c, invecs, count, to, retlen);
 
@@ -629,34 +755,22 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig
 		memset(c->wbuf,0xff,c->wbuf_pagesize);
 	}
 
-	/* Fixup the wbuf if we are moving to a new eraseblock.  The checks below
-	   fail for ECC'd NOR because cleanmarker == 16, so a block starts at
-	   xxx0010.  */
-	if (jffs2_nor_ecc(c)) {
-		if (((c->wbuf_ofs % c->sector_size) == 0) && !c->wbuf_len) {
-			c->wbuf_ofs = PAGE_DIV(to);
-			c->wbuf_len = PAGE_MOD(to);
-			memset(c->wbuf,0xff,c->wbuf_pagesize);
-		}
-	}
-
-	/* Sanity checks on target address.
-	   It's permitted to write at PAD(c->wbuf_len+c->wbuf_ofs),
-	   and it's permitted to write at the beginning of a new
-	   erase block. Anything else, and you die.
-	   New block starts at xxx000c (0-b = block header)
-	*/
+	/*
+	 * Sanity checks on target address.  It's permitted to write
+	 * at PAD(c->wbuf_len+c->wbuf_ofs), and it's permitted to
+	 * write at the beginning of a new erase block. Anything else,
+	 * and you die.  New block starts at xxx000c (0-b = block
+	 * header)
+	 */
 	if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) {
 		/* It's a write to a new block */
 		if (c->wbuf_len) {
-			D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx causes flush of wbuf at 0x%08x\n", (unsigned long)to, c->wbuf_ofs));
+			D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx "
+				  "causes flush of wbuf at 0x%08x\n",
+				  (unsigned long)to, c->wbuf_ofs));
 			ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT);
-			if (ret) {
-				/* the underlying layer has to check wbuf_len to do the cleanup */
-				D1(printk(KERN_WARNING "jffs2_flush_wbuf() called from jffs2_flash_writev() failed %d\n", ret));
-				*retlen = 0;
-				goto exit;
-			}
+			if (ret)
+				goto outerr;
 		}
 		/* set pointer to new block */
 		c->wbuf_ofs = PAGE_DIV(to);
@@ -665,165 +779,70 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig
 
 	if (to != PAD(c->wbuf_ofs + c->wbuf_len)) {
 		/* We're not writing immediately after the writebuffer. Bad. */
-		printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write to %08lx\n", (unsigned long)to);
+		printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write "
+		       "to %08lx\n", (unsigned long)to);
 		if (c->wbuf_len)
 			printk(KERN_CRIT "wbuf was previously %08x-%08x\n",
-					  c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len);
+			       c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len);
 		BUG();
 	}
 
-	/* Note outvecs[3] above. We know count is never greater than 2 */
-	if (count > 2) {
-		printk(KERN_CRIT "jffs2_flash_writev(): count is %ld\n", count);
-		BUG();
-	}
-
-	invec = 0;
-	outvec = 0;
-
-	/* Fill writebuffer first, if already in use */
-	if (c->wbuf_len) {
-		uint32_t invec_ofs = 0;
-
-		/* adjust alignment offset */
-		if (c->wbuf_len != PAGE_MOD(to)) {
-			c->wbuf_len = PAGE_MOD(to);
-			/* take care of alignment to next page */
-			if (!c->wbuf_len)
-				c->wbuf_len = c->wbuf_pagesize;
-		}
-
-		while(c->wbuf_len < c->wbuf_pagesize) {
-			uint32_t thislen;
-
-			if (invec == count)
-				goto alldone;
-
-			thislen = c->wbuf_pagesize - c->wbuf_len;
-
-			if (thislen >= invecs[invec].iov_len)
-				thislen = invecs[invec].iov_len;
-
-			invec_ofs = thislen;
-
-			memcpy(c->wbuf + c->wbuf_len, invecs[invec].iov_base, thislen);
-			c->wbuf_len += thislen;
-			donelen += thislen;
-			/* Get next invec, if actual did not fill the buffer */
-			if (c->wbuf_len < c->wbuf_pagesize)
-				invec++;
-		}
-
-		/* write buffer is full, flush buffer */
-		ret = __jffs2_flush_wbuf(c, NOPAD);
-		if (ret) {
-			/* the underlying layer has to check wbuf_len to do the cleanup */
-			D1(printk(KERN_WARNING "jffs2_flush_wbuf() called from jffs2_flash_writev() failed %d\n", ret));
-			/* Retlen zero to make sure our caller doesn't mark the space dirty.
-			   We've already done everything that's necessary */
-			*retlen = 0;
-			goto exit;
-		}
-		outvec_to += donelen;
-		c->wbuf_ofs = outvec_to;
-
-		/* All invecs done ? */
-		if (invec == count)
-			goto alldone;
-
-		/* Set up the first outvec, containing the remainder of the
-		   invec we partially used */
-		if (invecs[invec].iov_len > invec_ofs) {
-			outvecs[0].iov_base = invecs[invec].iov_base+invec_ofs;
-			totlen = outvecs[0].iov_len = invecs[invec].iov_len-invec_ofs;
-			if (totlen > c->wbuf_pagesize) {
-				splitvec = outvec;
-				split_ofs = outvecs[0].iov_len - PAGE_MOD(totlen);
-			}
-			outvec++;
-		}
-		invec++;
-	}
-
-	/* OK, now we've flushed the wbuf and the start of the bits
-	   we have been asked to write, now to write the rest.... */
-
-	/* totlen holds the amount of data still to be written */
-	old_totlen = totlen;
-	for ( ; invec < count; invec++,outvec++ ) {
-		outvecs[outvec].iov_base = invecs[invec].iov_base;
-		totlen += outvecs[outvec].iov_len = invecs[invec].iov_len;
-		if (PAGE_DIV(totlen) != PAGE_DIV(old_totlen)) {
-			splitvec = outvec;
-			split_ofs = outvecs[outvec].iov_len - PAGE_MOD(totlen);
-			old_totlen = totlen;
+	/* adjust alignment offset */
+	if (c->wbuf_len != PAGE_MOD(to)) {
+		c->wbuf_len = PAGE_MOD(to);
+		/* take care of alignment to next page */
+		if (!c->wbuf_len) {
+			c->wbuf_len = c->wbuf_pagesize;
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
 		}
 	}
 
-	/* Now the outvecs array holds all the remaining data to write */
-	/* Up to splitvec,split_ofs is to be written immediately. The rest
-	   goes into the (now-empty) wbuf */
-
-	if (splitvec != -1) {
-		uint32_t remainder;
-
-		remainder = outvecs[splitvec].iov_len - split_ofs;
-		outvecs[splitvec].iov_len = split_ofs;
-
-		/* We did cross a page boundary, so we write some now */
-		if (jffs2_cleanmarker_oob(c))
-			ret = c->mtd->writev_ecc(c->mtd, outvecs, splitvec+1, outvec_to, &wbuf_retlen, NULL, c->oobinfo);
-		else
-			ret = jffs2_flash_direct_writev(c, outvecs, splitvec+1, outvec_to, &wbuf_retlen);
-
-		if (ret < 0 || wbuf_retlen != PAGE_DIV(totlen)) {
-			/* At this point we have no problem,
-			   c->wbuf is empty. However refile nextblock to avoid
-			   writing again to same address.
-			*/
-			struct jffs2_eraseblock *jeb;
+	for (invec = 0; invec < count; invec++) {
+		int vlen = invecs[invec].iov_len;
+		uint8_t *v = invecs[invec].iov_base;
 
-			spin_lock(&c->erase_completion_lock);
+		wbuf_retlen = jffs2_fill_wbuf(c, v, vlen);
 
-			jeb = &c->blocks[outvec_to / c->sector_size];
-			jffs2_block_refile(c, jeb, REFILE_ANYWAY);
-
-			*retlen = 0;
-			spin_unlock(&c->erase_completion_lock);
-			goto exit;
+		if (c->wbuf_len == c->wbuf_pagesize) {
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
 		}
-
+		vlen -= wbuf_retlen;
+		outvec_to += wbuf_retlen;
 		donelen += wbuf_retlen;
-		c->wbuf_ofs = PAGE_DIV(outvec_to) + PAGE_DIV(totlen);
-
-		if (remainder) {
-			outvecs[splitvec].iov_base += split_ofs;
-			outvecs[splitvec].iov_len = remainder;
-		} else {
-			splitvec++;
+		v += wbuf_retlen;
+
+		if (vlen >= c->wbuf_pagesize) {
+			ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen),
+					    &wbuf_retlen, v);
+			if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen))
+				goto outfile;
+
+			vlen -= wbuf_retlen;
+			outvec_to += wbuf_retlen;
+			c->wbuf_ofs = outvec_to;
+			donelen += wbuf_retlen;
+			v += wbuf_retlen;
 		}
 
-	} else {
-		splitvec = 0;
-	}
-
-	/* Now splitvec points to the start of the bits we have to copy
-	   into the wbuf */
-	wbuf_ptr = c->wbuf;
+		wbuf_retlen = jffs2_fill_wbuf(c, v, vlen);
+		if (c->wbuf_len == c->wbuf_pagesize) {
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
+		}
 
-	for ( ; splitvec < outvec; splitvec++) {
-		/* Don't copy the wbuf into itself */
-		if (outvecs[splitvec].iov_base == c->wbuf)
-			continue;
-		memcpy(wbuf_ptr, outvecs[splitvec].iov_base, outvecs[splitvec].iov_len);
-		wbuf_ptr += outvecs[splitvec].iov_len;
-		donelen += outvecs[splitvec].iov_len;
+		outvec_to += wbuf_retlen;
+		donelen += wbuf_retlen;
 	}
-	c->wbuf_len = wbuf_ptr - c->wbuf;
 
-	/* If there's a remainder in the wbuf and it's a non-GC write,
-	   remember that the wbuf affects this ino */
-alldone:
+	/*
+	 * If there's a remainder in the wbuf and it's a non-GC write,
+	 * remember that the wbuf affects this ino
+	 */
 	*retlen = donelen;
 
 	if (jffs2_sum_active()) {
@@ -836,8 +855,24 @@ alldone:
 		jffs2_wbuf_dirties_inode(c, ino);
 
 	ret = 0;
+	up_write(&c->wbuf_sem);
+	return ret;
 
-exit:
+outfile:
+	/*
+	 * At this point we have no problem: c->wbuf is empty. However,
+	 * refile nextblock to avoid writing again to the same address.
+	 */
+
+	spin_lock(&c->erase_completion_lock);
+
+	jeb = &c->blocks[outvec_to / c->sector_size];
+	jffs2_block_refile(c, jeb, REFILE_ANYWAY);
+
+	spin_unlock(&c->erase_completion_lock);
+
+outerr:
+	*retlen = 0;
 	up_write(&c->wbuf_sem);
 	return ret;
 }
@@ -846,7 +881,8 @@ exit:
  *	This is the entry for flash write.
  *	Check, if we work on NAND FLASH, if so build an kvec and write it via vritev
 */
-int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf)
+int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
+		      size_t *retlen, const u_char *buf)
 {
 	struct kvec vecs[1];
 
@@ -871,25 +907,23 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
 
 	/* Read flash */
 	down_read(&c->wbuf_sem);
-	if (jffs2_cleanmarker_oob(c))
-		ret = c->mtd->read_ecc(c->mtd, ofs, len, retlen, buf, NULL, c->oobinfo);
-	else
-		ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
-
-	if ( (ret == -EBADMSG) && (*retlen == len) ) {
-		printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n",
-		       len, ofs);
+	ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
+
+	if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) {
+		if (ret == -EBADMSG)
+			printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx)"
+			       " returned ECC error\n", len, ofs);
 		/*
-		 * We have the raw data without ECC correction in the buffer, maybe
-		 * we are lucky and all data or parts are correct. We check the node.
-		 * If data are corrupted node check will sort it out.
-		 * We keep this block, it will fail on write or erase and the we
-		 * mark it bad. Or should we do that now? But we should give him a chance.
-		 * Maybe we had a system crash or power loss before the ecc write or
-		 * a erase was completed.
+		 * We have the raw data without ECC correction in the buffer,
+		 * maybe we are lucky and all data or parts are correct. We
+		 * check the node.  If data are corrupted node check will sort
+		 * it out.  We keep this block, it will fail on write or erase
+		 * and then we mark it bad. Or should we do that now? But we
+		 * should give it a chance.  Maybe we had a system crash or
+		 * power loss before the ECC write or an erase was completed.
 		 * So we return success. :)
 		 */
-	 	ret = 0;
+		ret = 0;
 	}
 
 	/* if no writebuffer available or write buffer empty, return */
@@ -911,7 +945,7 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
 		orbf = (c->wbuf_ofs - ofs);	/* offset in read buffer */
 		if (orbf > len)			/* is write beyond write buffer ? */
 			goto exit;
-		lwbf = len - orbf; 		/* number of bytes to copy */
+		lwbf = len - orbf;		/* number of bytes to copy */
 		if (lwbf > c->wbuf_len)
 			lwbf = c->wbuf_len;
 	}
@@ -923,158 +957,159 @@ exit:
 	return ret;
 }
 
+#define NR_OOB_SCAN_PAGES	4
+
 /*
- *	Check, if the out of band area is empty
+ * Check, if the out of band area is empty
  */
-int jffs2_check_oob_empty( struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int mode)
+int jffs2_check_oob_empty(struct jffs2_sb_info *c,
+			  struct jffs2_eraseblock *jeb, int mode)
 {
-	unsigned char *buf;
-	int 	ret = 0;
-	int	i,len,page;
-	size_t  retlen;
-	int	oob_size;
-
-	/* allocate a buffer for all oob data in this sector */
-	oob_size = c->mtd->oobsize;
-	len = 4 * oob_size;
-	buf = kmalloc(len, GFP_KERNEL);
-	if (!buf) {
-		printk(KERN_NOTICE "jffs2_check_oob_empty(): allocation of temporary data buffer for oob check failed\n");
-		return -ENOMEM;
-	}
-	/*
-	 * if mode = 0, we scan for a total empty oob area, else we have
-	 * to take care of the cleanmarker in the first page of the block
-	*/
-	ret = jffs2_flash_read_oob(c, jeb->offset, len , &retlen, buf);
+	int i, page, ret;
+	int oobsize = c->mtd->oobsize;
+	struct mtd_oob_ops ops;
+
+	ops.len = NR_OOB_SCAN_PAGES * oobsize;
+	ops.ooblen = oobsize;
+	ops.oobbuf = c->oobbuf;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
+
+	ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops);
 	if (ret) {
-		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB failed %d for block at %08x\n", ret, jeb->offset));
-		goto out;
+		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
+			  "failed %d for block at %08x\n", ret, jeb->offset));
+		return ret;
 	}
 
-	if (retlen < len) {
-		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB return short read "
-			  "(%zd bytes not %d) for block at %08x\n", retlen, len, jeb->offset));
-		ret = -EIO;
-		goto out;
+	if (ops.retlen < ops.len) {
+		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
+			  "returned short read (%zd bytes not %d) for block "
+			  "at %08x\n", ops.retlen, ops.len, jeb->offset));
+		return -EIO;
 	}
 
 	/* Special check for first page */
-	for(i = 0; i < oob_size ; i++) {
+	for(i = 0; i < oobsize ; i++) {
 		/* Yeah, we know about the cleanmarker. */
 		if (mode && i >= c->fsdata_pos &&
 		    i < c->fsdata_pos + c->fsdata_len)
 			continue;
 
-		if (buf[i] != 0xFF) {
-			D2(printk(KERN_DEBUG "Found %02x at %x in OOB for %08x\n",
-				  buf[i], i, jeb->offset));
-			ret = 1;
-			goto out;
+		if (ops.oobbuf[i] != 0xFF) {
+			D2(printk(KERN_DEBUG "Found %02x at %x in OOB for "
+				  "%08x\n", ops.oobbuf[i], i, jeb->offset));
+			return 1;
 		}
 	}
 
 	/* we know, we are aligned :) */
-	for (page = oob_size; page < len; page += sizeof(long)) {
-		unsigned long dat = *(unsigned long *)(&buf[page]);
-		if(dat != -1) {
-			ret = 1;
-			goto out;
-		}
+	for (page = oobsize; page < ops.len; page += sizeof(long)) {
+		long dat = *(long *)(&ops.oobbuf[page]);
+		if(dat != -1)
+			return 1;
 	}
-
-out:
-	kfree(buf);
-
-	return ret;
+	return 0;
 }
 
 /*
-*	Scan for a valid cleanmarker and for bad blocks
-*	For virtual blocks (concatenated physical blocks) check the cleanmarker
-*	only in the first page of the first physical block, but scan for bad blocks in all
-*	physical blocks
-*/
-int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+ * Scan for a valid cleanmarker and for bad blocks
+ */
+int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c,
+				  struct jffs2_eraseblock *jeb)
 {
 	struct jffs2_unknown_node n;
-	unsigned char buf[2 * NAND_MAX_OOBSIZE];
-	unsigned char *p;
-	int ret, i, cnt, retval = 0;
-	size_t retlen, offset;
-	int oob_size;
-
-	offset = jeb->offset;
-	oob_size = c->mtd->oobsize;
-
-	/* Loop through the physical blocks */
-	for (cnt = 0; cnt < (c->sector_size / c->mtd->erasesize); cnt++) {
-		/* Check first if the block is bad. */
-		if (c->mtd->block_isbad (c->mtd, offset)) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Bad block at %08x\n", jeb->offset));
-			return 2;
-		}
-		/*
-		   *    We read oob data from page 0 and 1 of the block.
-		   *    page 0 contains cleanmarker and badblock info
-		   *    page 1 contains failure count of this block
-		 */
-		ret = c->mtd->read_oob (c->mtd, offset, oob_size << 1, &retlen, buf);
+	struct mtd_oob_ops ops;
+	int oobsize = c->mtd->oobsize;
+	unsigned char *p,*b;
+	int i, ret;
+	size_t offset = jeb->offset;
+
+	/* Check first if the block is bad. */
+	if (c->mtd->block_isbad(c->mtd, offset)) {
+		D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker()"
+			   ": Bad block at %08x\n", jeb->offset));
+		return 2;
+	}
 
-		if (ret) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Read OOB failed %d for block at %08x\n", ret, jeb->offset));
-			return ret;
-		}
-		if (retlen < (oob_size << 1)) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Read OOB return short read (%zd bytes not %d) for block at %08x\n", retlen, oob_size << 1, jeb->offset));
-			return -EIO;
-		}
+	ops.len = oobsize;
+	ops.ooblen = oobsize;
+	ops.oobbuf = c->oobbuf;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
 
-		/* Check cleanmarker only on the first physical block */
-		if (!cnt) {
-			n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
-			n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
-			n.totlen = cpu_to_je32 (8);
-			p = (unsigned char *) &n;
+	ret = c->mtd->read_oob(c->mtd, offset, &ops);
+	if (ret) {
+		D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+			   "Read OOB failed %d for block at %08x\n",
+			   ret, jeb->offset));
+		return ret;
+	}
 
-			for (i = 0; i < c->fsdata_len; i++) {
-				if (buf[c->fsdata_pos + i] != p[i]) {
-					retval = 1;
-				}
-			}
-			D1(if (retval == 1) {
-				printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): Cleanmarker node not detected in block at %08x\n", jeb->offset);
-				printk(KERN_WARNING "OOB at %08x was ", offset);
-				for (i=0; i < oob_size; i++) {
-					printk("%02x ", buf[i]);
-				}
-				printk("\n");
-			})
-		}
-		offset += c->mtd->erasesize;
+	if (ops.retlen < ops.len) {
+		D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+			    "Read OOB return short read (%zd bytes not %d) "
+			    "for block at %08x\n", ops.retlen, ops.len,
+			    jeb->offset));
+		return -EIO;
 	}
-	return retval;
+
+	n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
+	n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
+	n.totlen = cpu_to_je32 (8);
+	p = (unsigned char *) &n;
+	b = c->oobbuf + c->fsdata_pos;
+
+	for (i = c->fsdata_len; i; i--) {
+		if (*b++ != *p++)
+			ret = 1;
+	}
+
+	D1(if (ret == 1) {
+		printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+		       "Cleanmarker node not detected in block at %08x\n",
+		       offset);
+		printk(KERN_WARNING "OOB at %08zx was ", offset);
+		for (i=0; i < oobsize; i++)
+			printk("%02x ", c->oobbuf[i]);
+		printk("\n");
+	});
+	return ret;
 }
 
-int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb)
 {
-	struct 	jffs2_unknown_node n;
-	int 	ret;
-	size_t 	retlen;
+	struct jffs2_unknown_node n;
+	int	ret;
+	struct mtd_oob_ops ops;
 
 	n.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	n.nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER);
 	n.totlen = cpu_to_je32(8);
 
-	ret = jffs2_flash_write_oob(c, jeb->offset + c->fsdata_pos, c->fsdata_len, &retlen, (unsigned char *)&n);
+	ops.len = c->fsdata_len;
+	ops.ooblen = c->fsdata_len;;
+	ops.oobbuf = (uint8_t *)&n;
+	ops.ooboffs = c->fsdata_pos;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
+
+	ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops);
 
 	if (ret) {
-		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): Write failed for block at %08x: error %d\n", jeb->offset, ret));
+		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
+			  "Write failed for block at %08x: error %d\n",
+			  jeb->offset, ret));
 		return ret;
 	}
-	if (retlen != c->fsdata_len) {
-		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): Short write for block at %08x: %zd not %d\n", jeb->offset, retlen, c->fsdata_len));
-		return ret;
+	if (ops.retlen != ops.len) {
+		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
+			  "Short write for block at %08x: %zd not %d\n",
+			  jeb->offset, ops.retlen, ops.len));
+		return -EIO;
 	}
 	return 0;
 }
@@ -1108,18 +1143,9 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
 	return 1;
 }
 
-#define NAND_JFFS2_OOB16_FSDALEN	8
-
-static struct nand_oobinfo jffs2_oobinfo_docecc = {
-	.useecc = MTD_NANDECC_PLACE,
-	.eccbytes = 6,
-	.eccpos = {0,1,2,3,4,5}
-};
-
-
 static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 {
-	struct nand_oobinfo *oinfo = &c->mtd->oobinfo;
+	struct nand_ecclayout *oinfo = c->mtd->ecclayout;
 
 	/* Do this only, if we have an oob buffer */
 	if (!c->mtd->oobsize)
@@ -1129,33 +1155,23 @@ static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 	c->cleanmarker_size = 0;
 
 	/* Should we use autoplacement ? */
-	if (oinfo && oinfo->useecc == MTD_NANDECC_AUTOPLACE) {
-		D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
-		/* Get the position of the free bytes */
-		if (!oinfo->oobfree[0][1]) {
-			printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep. Autoplacement selected and no empty space in oob\n");
-			return -ENOSPC;
-		}
-		c->fsdata_pos = oinfo->oobfree[0][0];
-		c->fsdata_len = oinfo->oobfree[0][1];
-		if (c->fsdata_len > 8)
-			c->fsdata_len = 8;
-	} else {
-		/* This is just a legacy fallback and should go away soon */
-		switch(c->mtd->ecctype) {
-		case MTD_ECC_RS_DiskOnChip:
-			printk(KERN_WARNING "JFFS2 using DiskOnChip hardware ECC without autoplacement. Fix it!\n");
-			c->oobinfo = &jffs2_oobinfo_docecc;
-			c->fsdata_pos = 6;
-			c->fsdata_len = NAND_JFFS2_OOB16_FSDALEN;
-			c->badblock_pos = 15;
-			break;
+	if (!oinfo) {
+		D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
+		return -EINVAL;
+	}
 
-		default:
-			D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
-			return -EINVAL;
-		}
+	D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
+	/* Get the position of the free bytes */
+	if (!oinfo->oobfree[0].length) {
+		printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep."
+			" Autoplacement selected and no empty space in oob\n");
+		return -ENOSPC;
 	}
+	c->fsdata_pos = oinfo->oobfree[0].offset;
+	c->fsdata_len = oinfo->oobfree[0].length;
+	if (c->fsdata_len > 8)
+		c->fsdata_len = 8;
+
 	return 0;
 }
 
@@ -1165,13 +1181,17 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 
 	/* Initialise write buffer */
 	init_rwsem(&c->wbuf_sem);
-	c->wbuf_pagesize = c->mtd->oobblock;
+	c->wbuf_pagesize = c->mtd->writesize;
 	c->wbuf_ofs = 0xFFFFFFFF;
 
 	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
 	if (!c->wbuf)
 		return -ENOMEM;
 
+	c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->mtd->oobsize, GFP_KERNEL);
+	if (!c->oobbuf)
+		return -ENOMEM;
+
 	res = jffs2_nand_set_oobinfo(c);
 
 #ifdef BREAKME
@@ -1189,6 +1209,7 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c)
 {
 	kfree(c->wbuf);
+	kfree(c->oobbuf);
 }
 
 int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
@@ -1236,33 +1257,14 @@ void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) {
 	kfree(c->wbuf);
 }
 
-int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c) {
-	/* Cleanmarker is actually larger on the flashes */
-	c->cleanmarker_size = 16;
-
-	/* Initialize write buffer */
-	init_rwsem(&c->wbuf_sem);
-	c->wbuf_pagesize = c->mtd->eccsize;
-	c->wbuf_ofs = 0xFFFFFFFF;
-
-	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
-	if (!c->wbuf)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c) {
-	kfree(c->wbuf);
-}
-
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
-	/* Cleanmarker currently occupies a whole programming region */
-	c->cleanmarker_size = MTD_PROGREGION_SIZE(c->mtd);
+	/* Cleanmarker currently occupies whole programming regions,
+	 * either one or 2 for 8Byte STMicro flashes. */
+	c->cleanmarker_size = max(16u, c->mtd->writesize);
 
 	/* Initialize write buffer */
 	init_rwsem(&c->wbuf_sem);
-	c->wbuf_pagesize = MTD_PROGREGION_SIZE(c->mtd);
+	c->wbuf_pagesize = c->mtd->writesize;
 	c->wbuf_ofs = 0xFFFFFFFF;
 
 	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 1342f0158e9b..67176792e138 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -37,7 +37,6 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint
 	f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
 	f->inocache->state = INO_STATE_PRESENT;
 
-
 	jffs2_add_ino_cache(c, f->inocache);
 	D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino));
 	ri->ino = cpu_to_je32(f->inocache->ino);
@@ -57,12 +56,14 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint
 /* jffs2_write_dnode - given a raw_inode, allocate a full_dnode for it,
    write it to the flash, link it into the existing inode/fragment list */
 
-struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const unsigned char *data, uint32_t datalen, uint32_t flash_ofs, int alloc_mode)
+struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					   struct jffs2_raw_inode *ri, const unsigned char *data,
+					   uint32_t datalen, int alloc_mode)
 
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_full_dnode *fn;
 	size_t retlen;
+	uint32_t flash_ofs;
 	struct kvec vecs[2];
 	int ret;
 	int retried = 0;
@@ -78,34 +79,21 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	vecs[1].iov_base = (unsigned char *)data;
 	vecs[1].iov_len = datalen;
 
-	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
-
 	if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) {
 		printk(KERN_WARNING "jffs2_write_dnode: ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", je32_to_cpu(ri->totlen), sizeof(*ri), datalen);
 	}
-	raw = jffs2_alloc_raw_node_ref();
-	if (!raw)
-		return ERR_PTR(-ENOMEM);
 
 	fn = jffs2_alloc_full_dnode();
-	if (!fn) {
-		jffs2_free_raw_node_ref(raw);
+	if (!fn)
 		return ERR_PTR(-ENOMEM);
-	}
-
-	fn->ofs = je32_to_cpu(ri->offset);
-	fn->size = je32_to_cpu(ri->dsize);
-	fn->frags = 0;
 
 	/* check number of valid vecs */
 	if (!datalen || !data)
 		cnt = 1;
  retry:
-	fn->raw = raw;
+	flash_ofs = write_ofs(c);
 
-	raw->flash_offset = flash_ofs;
-	raw->__totlen = PAD(sizeof(*ri)+datalen);
-	raw->next_phys = NULL;
+	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
 
 	if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) {
 		BUG_ON(!retried);
@@ -125,22 +113,16 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 
 		/* Mark the space as dirtied */
 		if (retlen) {
-			/* Doesn't belong to any inode */
-			raw->next_in_ino = NULL;
-
 			/* Don't change raw->size to match retlen. We may have
 			   written the node header already, and only the data will
 			   seem corrupted, in which case the scan would skip over
 			   any node we write before the original intended end of
 			   this node */
-			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw);
-			jffs2_mark_node_obsolete(c, raw);
+			jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL);
 		} else {
-			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
-			jffs2_free_raw_node_ref(raw);
+			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs);
 		}
-		if (!retried && alloc_mode != ALLOC_NORETRY && (raw = jffs2_alloc_raw_node_ref())) {
+		if (!retried && alloc_mode != ALLOC_NORETRY) {
 			/* Try to reallocate space and retry */
 			uint32_t dummy;
 			struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size];
@@ -153,19 +135,20 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 			jffs2_dbg_acct_paranoia_check(c, jeb);
 
 			if (alloc_mode == ALLOC_GC) {
-				ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &flash_ofs,
-							&dummy, JFFS2_SUMMARY_INODE_SIZE);
+				ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &dummy,
+							     JFFS2_SUMMARY_INODE_SIZE);
 			} else {
 				/* Locking pain */
 				up(&f->sem);
 				jffs2_complete_reservation(c);
 
-				ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &flash_ofs,
-							&dummy, alloc_mode, JFFS2_SUMMARY_INODE_SIZE);
+				ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &dummy,
+							  alloc_mode, JFFS2_SUMMARY_INODE_SIZE);
 				down(&f->sem);
 			}
 
 			if (!ret) {
+				flash_ofs = write_ofs(c);
 				D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs));
 
 				jffs2_dbg_acct_sanity_check(c,jeb);
@@ -174,7 +157,6 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 				goto retry;
 			}
 			D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-			jffs2_free_raw_node_ref(raw);
 		}
 		/* Release the full_dnode which is now useless, and return */
 		jffs2_free_full_dnode(fn);
@@ -188,20 +170,17 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) ||
 	    ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) &&
 	      (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) ==  je32_to_cpu(ri->isize)))) {
-		raw->flash_offset |= REF_PRISTINE;
+		flash_ofs |= REF_PRISTINE;
 	} else {
-		raw->flash_offset |= REF_NORMAL;
+		flash_ofs |= REF_NORMAL;
 	}
-	jffs2_add_physical_node_ref(c, raw);
-
-	/* Link into per-inode list */
-	spin_lock(&c->erase_completion_lock);
-	raw->next_in_ino = f->inocache->nodes;
-	f->inocache->nodes = raw;
-	spin_unlock(&c->erase_completion_lock);
+	fn->raw = jffs2_add_physical_node_ref(c, flash_ofs, PAD(sizeof(*ri)+datalen), f->inocache);
+	fn->ofs = je32_to_cpu(ri->offset);
+	fn->size = je32_to_cpu(ri->dsize);
+	fn->frags = 0;
 
 	D1(printk(KERN_DEBUG "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n",
-		  flash_ofs, ref_flags(raw), je32_to_cpu(ri->dsize),
+		  flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize),
 		  je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc),
 		  je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen)));
 
@@ -212,12 +191,14 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	return fn;
 }
 
-struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_dirent *rd, const unsigned char *name, uint32_t namelen, uint32_t flash_ofs, int alloc_mode)
+struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					     struct jffs2_raw_dirent *rd, const unsigned char *name,
+					     uint32_t namelen, int alloc_mode)
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_full_dirent *fd;
 	size_t retlen;
 	struct kvec vecs[2];
+	uint32_t flash_ofs;
 	int retried = 0;
 	int ret;
 
@@ -228,26 +209,16 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 	D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) {
 		printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dirent()\n");
 		BUG();
-	}
-	   );
+	   });
 
 	vecs[0].iov_base = rd;
 	vecs[0].iov_len = sizeof(*rd);
 	vecs[1].iov_base = (unsigned char *)name;
 	vecs[1].iov_len = namelen;
 
-	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
-
-	raw = jffs2_alloc_raw_node_ref();
-
-	if (!raw)
-		return ERR_PTR(-ENOMEM);
-
 	fd = jffs2_alloc_full_dirent(namelen+1);
-	if (!fd) {
-		jffs2_free_raw_node_ref(raw);
+	if (!fd)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	fd->version = je32_to_cpu(rd->version);
 	fd->ino = je32_to_cpu(rd->ino);
@@ -257,11 +228,9 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 	fd->name[namelen]=0;
 
  retry:
-	fd->raw = raw;
+	flash_ofs = write_ofs(c);
 
-	raw->flash_offset = flash_ofs;
-	raw->__totlen = PAD(sizeof(*rd)+namelen);
-	raw->next_phys = NULL;
+	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
 
 	if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) {
 		BUG_ON(!retried);
@@ -280,15 +249,11 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 			       sizeof(*rd)+namelen, flash_ofs, ret, retlen);
 		/* Mark the space as dirtied */
 		if (retlen) {
-			raw->next_in_ino = NULL;
-			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw);
-			jffs2_mark_node_obsolete(c, raw);
+			jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL);
 		} else {
-			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
-			jffs2_free_raw_node_ref(raw);
+			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs);
 		}
-		if (!retried && (raw = jffs2_alloc_raw_node_ref())) {
+		if (!retried) {
 			/* Try to reallocate space and retry */
 			uint32_t dummy;
 			struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size];
@@ -301,39 +266,33 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 			jffs2_dbg_acct_paranoia_check(c, jeb);
 
 			if (alloc_mode == ALLOC_GC) {
-				ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &flash_ofs,
-							&dummy, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+				ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &dummy,
+							     JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 			} else {
 				/* Locking pain */
 				up(&f->sem);
 				jffs2_complete_reservation(c);
 
-				ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &flash_ofs,
-							&dummy, alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+				ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &dummy,
+							  alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 				down(&f->sem);
 			}
 
 			if (!ret) {
+				flash_ofs = write_ofs(c);
 				D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs));
 				jffs2_dbg_acct_sanity_check(c,jeb);
 				jffs2_dbg_acct_paranoia_check(c, jeb);
 				goto retry;
 			}
 			D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-			jffs2_free_raw_node_ref(raw);
 		}
 		/* Release the full_dnode which is now useless, and return */
 		jffs2_free_full_dirent(fd);
 		return ERR_PTR(ret?ret:-EIO);
 	}
 	/* Mark the space used */
-	raw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, raw);
-
-	spin_lock(&c->erase_completion_lock);
-	raw->next_in_ino = f->inocache->nodes;
-	f->inocache->nodes = raw;
-	spin_unlock(&c->erase_completion_lock);
+	fd->raw = jffs2_add_physical_node_ref(c, flash_ofs | REF_PRISTINE, PAD(sizeof(*rd)+namelen), f->inocache);
 
 	if (retried) {
 		jffs2_dbg_acct_sanity_check(c,NULL);
@@ -359,14 +318,14 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		struct jffs2_full_dnode *fn;
 		unsigned char *comprbuf = NULL;
 		uint16_t comprtype = JFFS2_COMPR_NONE;
-		uint32_t phys_ofs, alloclen;
+		uint32_t alloclen;
 		uint32_t datalen, cdatalen;
 		int retried = 0;
 
 	retry:
 		D2(printk(KERN_DEBUG "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", writelen, offset));
 
-		ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN, &phys_ofs,
+		ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN,
 					&alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 		if (ret) {
 			D1(printk(KERN_DEBUG "jffs2_reserve_space returned %d\n", ret));
@@ -394,7 +353,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 		ri->data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen));
 
-		fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, phys_ofs, ALLOC_NORETRY);
+		fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, ALLOC_NORETRY);
 
 		jffs2_free_comprbuf(comprbuf, buf);
 
@@ -448,13 +407,13 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 	struct jffs2_raw_dirent *rd;
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	/* Try to reserve enough space for both node and dirent.
 	 * Just the node will do for now, though
 	 */
-	ret = jffs2_reserve_space(c, sizeof(*ri), &phys_ofs, &alloclen, ALLOC_NORMAL,
+	ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
 				JFFS2_SUMMARY_INODE_SIZE);
 	D1(printk(KERN_DEBUG "jffs2_do_create(): reserved 0x%x bytes\n", alloclen));
 	if (ret) {
@@ -465,7 +424,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 	ri->data_crc = cpu_to_je32(0);
 	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 
-	fn = jffs2_write_dnode(c, f, ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+	fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
 
 	D1(printk(KERN_DEBUG "jffs2_do_create created file with mode 0x%x\n",
 		  jemode_to_cpu(ri->mode)));
@@ -484,7 +443,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 
 	up(&f->sem);
 	jffs2_complete_reservation(c);
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 
 	if (ret) {
@@ -516,7 +475,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL);
 
 	jffs2_free_raw_dirent(rd);
 
@@ -545,7 +504,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
 {
 	struct jffs2_raw_dirent *rd;
 	struct jffs2_full_dirent *fd;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	if (1 /* alternative branch needs testing */ ||
@@ -556,7 +515,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
 		if (!rd)
 			return -ENOMEM;
 
-		ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+		ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 					ALLOC_DELETION, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 		if (ret) {
 			jffs2_free_raw_dirent(rd);
@@ -580,7 +539,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
 		rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 		rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
 
-		fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_DELETION);
+		fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_DELETION);
 
 		jffs2_free_raw_dirent(rd);
 
@@ -659,14 +618,14 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint
 {
 	struct jffs2_raw_dirent *rd;
 	struct jffs2_full_dirent *fd;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	rd = jffs2_alloc_raw_dirent();
 	if (!rd)
 		return -ENOMEM;
 
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
 		jffs2_free_raw_dirent(rd);
@@ -692,7 +651,7 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL);
 
 	jffs2_free_raw_dirent(rd);
 
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
new file mode 100644
index 000000000000..2d82e250be34
--- /dev/null
+++ b/fs/jffs2/xattr.c
@@ -0,0 +1,1238 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* -------- xdatum related functions ----------------
+ * xattr_datum_hashkey(xprefix, xname, xvalue, xsize)
+ *   is used to calculate the xdatum hashkey. The remainder of the hashkey modulo
+ *   XATTRINDEX_HASHSIZE is the index into the xattr name/value pair cache (c->xattrindex).
+ * unload_xattr_datum(c, xd)
+ *   is used to release xattr name/value pair and detach from c->xattrindex.
+ * reclaim_xattr_datum(c)
+ *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
+ *   is hard coded as 32KiB.
+ * delete_xattr_datum_node(c, xd)
+ *   is used to delete the jffs2 node that belongs to an xdatum. When EBS (Erase Block Summary)
+ *   is enabled, it overwrites the obsolete node on the medium itself.
+ * delete_xattr_datum(c, xd)
+ *   is used to delete a jffs2_xattr_datum object. It must be called with a reference
+ *   counter of 0. (The counter is how many jffs2_xattr_ref objects refer to this xdatum.)
+ * do_verify_xattr_datum(c, xd)
+ *   is used to load the xdatum information, without the name/value pair, from the medium.
+ *   It's necessary once, because that information is not collected during the mounting
+ *   process when EBS is enabled.
+ *   0 will be returned on success. A negative return value means a recoverable error, and
+ *   a positive return value means an unrecoverable error. Thus, the caller must remove this
+ *   xdatum and xref when it returns a positive value.
+ * do_load_xattr_datum(c, xd)
+ *   is used to load name/value pair from the medium.
+ *   The meaning of the return value is the same as for do_verify_xattr_datum().
+ * load_xattr_datum(c, xd)
+ *   is a wrapper around do_verify_xattr_datum() and do_load_xattr_datum().
+ *   If xd needs do_verify_xattr_datum() first, that is called before do_load_xattr_datum().
+ *   The meaning of the return value is the same as for do_verify_xattr_datum().
+ * save_xattr_datum(c, xd)
+ *   is used to write xdatum to medium. xd->version will be incremented.
+ * create_xattr_datum(c, xprefix, xname, xvalue, xsize)
+ *   is used to create new xdatum and write to medium.
+ * -------------------------------------------------- */
+
+static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
+{
+	int name_len = strlen(xname);
+
+	return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize);
+}
+
+static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	D1(dbg_xattr("%s: xid=%u, version=%u\n", __FUNCTION__, xd->xid, xd->version));
+	if (xd->xname) {
+		c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len);
+		kfree(xd->xname);
+	}
+
+	list_del_init(&xd->xindex);
+	xd->hashkey = 0;
+	xd->xname = NULL;
+	xd->xvalue = NULL;
+}
+
+static void reclaim_xattr_datum(struct jffs2_sb_info *c)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd, *_xd;
+	uint32_t target, before;
+	static int index = 0;
+	int count;
+
+	if (c->xdatum_mem_threshold > c->xdatum_mem_usage)
+		return;
+
+	before = c->xdatum_mem_usage;
+	target = c->xdatum_mem_usage * 4 / 5; /* 20% reduction */
+	for (count = 0; count < XATTRINDEX_HASHSIZE; count++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[index], xindex) {
+			if (xd->flags & JFFS2_XFLAGS_HOT) {
+				xd->flags &= ~JFFS2_XFLAGS_HOT;
+			} else if (!(xd->flags & JFFS2_XFLAGS_BIND)) {
+				unload_xattr_datum(c, xd);
+			}
+			if (c->xdatum_mem_usage <= target)
+				goto out;
+		}
+		index = (index+1) % XATTRINDEX_HASHSIZE;
+	}
+ out:
+	JFFS2_NOTICE("xdatum_mem_usage from %u byte to %u byte (%u byte reclaimed)\n",
+		     before, c->xdatum_mem_usage, before - c->xdatum_mem_usage);
+}
+
+static void delete_xattr_datum_node(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* Obsoletes xd's node; must be called under down_write(xattr_sem) */
+	struct jffs2_raw_xattr rx;
+	size_t length;
+	int rc;
+
+	if (!xd->node) {
+		JFFS2_WARNING("xdatum (xid=%u) is removed twice.\n", xd->xid);
+		return;
+	}
+	if (jffs2_sum_active()) {
+		memset(&rx, 0xff, sizeof(struct jffs2_raw_xattr));
+		rc = jffs2_flash_read(c, ref_offset(xd->node),
+				      sizeof(struct jffs2_unknown_node),
+				      &length, (char *)&rx);
+		if (rc || length != sizeof(struct jffs2_unknown_node)) {
+			JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
+				    rc, sizeof(struct jffs2_unknown_node),
+				    length, ref_offset(xd->node));
+		}
+		rc = jffs2_flash_write(c, ref_offset(xd->node), sizeof(rx),
+				       &length, (char *)&rx);
+		if (rc || length != sizeof(struct jffs2_raw_xattr)) {
+			JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu at %#08x\n",
+				    rc, sizeof(rx), length, ref_offset(xd->node));
+		}
+	}
+	spin_lock(&c->erase_completion_lock);
+	xd->node->next_in_ino = NULL;
+	spin_unlock(&c->erase_completion_lock);
+	jffs2_mark_node_obsolete(c, xd->node);
+	xd->node = NULL;
+}
+
+static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	BUG_ON(xd->refcnt);
+
+	unload_xattr_datum(c, xd);
+	if (xd->node) {
+		delete_xattr_datum_node(c, xd);
+		xd->node = NULL;
+	}
+	jffs2_free_xattr_datum(xd);
+}
+
+static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_xattr rx;
+	size_t readlen;
+	uint32_t crc, totlen;
+	int rc;
+
+	BUG_ON(!xd->node);
+	BUG_ON(ref_flags(xd->node) != REF_UNCHECKED);
+
+	rc = jffs2_flash_read(c, ref_offset(xd->node), sizeof(rx), &readlen, (char *)&rx);
+	if (rc || readlen != sizeof(rx)) {
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
+			      rc, sizeof(rx), readlen, ref_offset(xd->node));
+		return rc ? rc : -EIO;
+	}
+	crc = crc32(0, &rx, sizeof(rx) - 4);
+	if (crc != je32_to_cpu(rx.node_crc)) {
+		if (je32_to_cpu(rx.node_crc) != 0xffffffff)
+			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				    ref_offset(xd->node), je32_to_cpu(rx.node_crc), crc);
+		return EIO;
+	}
+	totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
+	if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
+	    || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR
+	    || je32_to_cpu(rx.totlen) != totlen
+	    || je32_to_cpu(rx.xid) != xd->xid
+	    || je32_to_cpu(rx.version) != xd->version) {
+		JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, "
+			    "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n",
+			    ref_offset(xd->node), je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
+			    je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR,
+			    je32_to_cpu(rx.totlen), totlen,
+			    je32_to_cpu(rx.xid), xd->xid,
+			    je32_to_cpu(rx.version), xd->version);
+		return EIO;
+	}
+	xd->xprefix = rx.xprefix;
+	xd->name_len = rx.name_len;
+	xd->value_len = je16_to_cpu(rx.value_len);
+	xd->data_crc = je32_to_cpu(rx.data_crc);
+
+	/* This JFFS2_NODETYPE_XATTR node is checked */
+	jeb = &c->blocks[ref_offset(xd->node) / c->sector_size];
+	totlen = PAD(je32_to_cpu(rx.totlen));
+
+	spin_lock(&c->erase_completion_lock);
+	c->unchecked_size -= totlen; c->used_size += totlen;
+	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+	xd->node->flash_offset = ref_offset(xd->node) | REF_PRISTINE;
+	spin_unlock(&c->erase_completion_lock);
+
+	/* unchecked xdatum is chained with c->xattr_unchecked */
+	list_del_init(&xd->xindex);
+
+	dbg_xattr("success on verifying xdatum (xid=%u, version=%u)\n",
+		  xd->xid, xd->version);
+
+	return 0;
+}
+
+static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	char *data;
+	size_t readlen;
+	uint32_t crc, length;
+	int i, ret, retry = 0;
+
+	BUG_ON(!xd->node);
+	BUG_ON(ref_flags(xd->node) != REF_PRISTINE);
+	BUG_ON(!list_empty(&xd->xindex));
+ retry:
+	length = xd->name_len + 1 + xd->value_len;
+	data = kmalloc(length, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ret = jffs2_flash_read(c, ref_offset(xd->node)+sizeof(struct jffs2_raw_xattr),
+			       length, &readlen, data);
+
+	if (ret || length!=readlen) {
+		JFFS2_WARNING("jffs2_flash_read() returned %d, request=%u, readlen=%zu, at %#08x\n",
+			      ret, length, readlen, ref_offset(xd->node));
+		kfree(data);
+		return ret ? ret : -EIO;
+	}
+
+	data[xd->name_len] = '\0';
+	crc = crc32(0, data, length);
+	if (crc != xd->data_crc) {
+		JFFS2_WARNING("node CRC failed (JFFS2_NODETYPE_XATTR)"
+			      " at %#08x, read: 0x%08x calculated: 0x%08x\n",
+			      ref_offset(xd->node), xd->data_crc, crc);
+		kfree(data);
+		return EIO;
+	}
+
+	xd->flags |= JFFS2_XFLAGS_HOT;
+	xd->xname = data;
+	xd->xvalue = data + xd->name_len+1;
+
+	c->xdatum_mem_usage += length;
+
+	xd->hashkey = xattr_datum_hashkey(xd->xprefix, xd->xname, xd->xvalue, xd->value_len);
+	i = xd->hashkey % XATTRINDEX_HASHSIZE;
+	list_add(&xd->xindex, &c->xattrindex[i]);
+	if (!retry) {
+		retry = 1;
+		reclaim_xattr_datum(c);
+		if (!xd->xname)
+			goto retry;
+	}
+
+	dbg_xattr("success on loading xdatum (xid=%u, xprefix=%u, xname='%s')\n",
+		  xd->xid, xd->xprefix, xd->xname);
+
+	return 0;
+}
+
+static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem);
+	 * rc < 0 : recoverable error, try again
+	 * rc = 0 : success
+	 * rc > 0 : Unrecoverable error, this node should be deleted.
+	 */
+	int rc = 0;
+	BUG_ON(xd->xname);
+	if (!xd->node)
+		return EIO;
+	if (unlikely(ref_flags(xd->node) != REF_PRISTINE)) {
+		rc = do_verify_xattr_datum(c, xd);
+		if (rc > 0) {
+			list_del_init(&xd->xindex);
+			delete_xattr_datum_node(c, xd);
+		}
+	}
+	if (!rc)
+		rc = do_load_xattr_datum(c, xd);
+	return rc;
+}
+
+static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_node_ref *raw;
+	struct jffs2_raw_xattr rx;
+	struct kvec vecs[2];
+	size_t length;
+	int rc, totlen;
+	uint32_t phys_ofs = write_ofs(c);
+
+	BUG_ON(!xd->xname);
+
+	vecs[0].iov_base = &rx;
+	vecs[0].iov_len = PAD(sizeof(rx));
+	vecs[1].iov_base = xd->xname;
+	vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
+	totlen = vecs[0].iov_len + vecs[1].iov_len;
+
+	/* Setup raw-xattr */
+	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR);
+	rx.totlen = cpu_to_je32(PAD(totlen));
+	rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4));
+
+	rx.xid = cpu_to_je32(xd->xid);
+	rx.version = cpu_to_je32(++xd->version);
+	rx.xprefix = xd->xprefix;
+	rx.name_len = xd->name_len;
+	rx.value_len = cpu_to_je16(xd->value_len);
+	rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
+	rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4));
+
+	rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
+	if (rc || totlen != length) {
+		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n",
+			      rc, totlen, length, phys_ofs);
+		rc = rc ? rc : -EIO;
+		if (length)
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(totlen), NULL);
+
+		return rc;
+	}
+
+	/* success */
+	raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), NULL);
+	/* FIXME */ raw->next_in_ino = (void *)xd;
+
+	if (xd->node)
+		delete_xattr_datum_node(c, xd);
+	xd->node = raw;
+
+	dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n",
+		  xd->xid, xd->version, xd->xprefix, xd->xname);
+
+	return 0;
+}
+
+static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
+						    int xprefix, const char *xname,
+						    const char *xvalue, int xsize)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+	uint32_t hashkey, name_len;
+	char *data;
+	int i, rc;
+
+	/* Search xattr_datum has same xname/xvalue by index */
+	hashkey = xattr_datum_hashkey(xprefix, xname, xvalue, xsize);
+	i = hashkey % XATTRINDEX_HASHSIZE;
+	list_for_each_entry(xd, &c->xattrindex[i], xindex) {
+		if (xd->hashkey==hashkey
+		    && xd->xprefix==xprefix
+		    && xd->value_len==xsize
+		    && !strcmp(xd->xname, xname)
+		    && !memcmp(xd->xvalue, xvalue, xsize)) {
+			xd->refcnt++;
+			return xd;
+		}
+	}
+
+	/* Not found, Create NEW XATTR-Cache */
+	name_len = strlen(xname);
+
+	xd = jffs2_alloc_xattr_datum();
+	if (!xd)
+		return ERR_PTR(-ENOMEM);
+
+	data = kmalloc(name_len + 1 + xsize, GFP_KERNEL);
+	if (!data) {
+		jffs2_free_xattr_datum(xd);
+		return ERR_PTR(-ENOMEM);
+	}
+	strcpy(data, xname);
+	memcpy(data + name_len + 1, xvalue, xsize);
+
+	xd->refcnt = 1;
+	xd->xid = ++c->highest_xid;
+	xd->flags |= JFFS2_XFLAGS_HOT;
+	xd->xprefix = xprefix;
+
+	xd->hashkey = hashkey;
+	xd->xname = data;
+	xd->xvalue = data + name_len + 1;
+	xd->name_len = name_len;
+	xd->value_len = xsize;
+	xd->data_crc = crc32(0, data, xd->name_len + 1 + xd->value_len);
+
+	rc = save_xattr_datum(c, xd);
+	if (rc) {
+		kfree(xd->xname);
+		jffs2_free_xattr_datum(xd);
+		return ERR_PTR(rc);
+	}
+
+	/* Insert Hash Index */
+	i = hashkey % XATTRINDEX_HASHSIZE;
+	list_add(&xd->xindex, &c->xattrindex[i]);
+
+	c->xdatum_mem_usage += (xd->name_len + 1 + xd->value_len);
+	reclaim_xattr_datum(c);
+
+	return xd;
+}
+
+/* -------- xref related functions ------------------
+ * verify_xattr_ref(c, ref)
+ *   is used to load xref information from medium. Because summary data does not
+ *   contain xid/ino, it's necessary to verify once while mounting process.
+ * delete_xattr_ref_node(c, ref)
+ *   is used to delete the jffs2 node that belongs to an xref. When EBS is enabled,
+ *   it overwrites the obsolete node on the medium itself.
+ * delete_xattr_ref(c, ref)
+ *   is used to delete a jffs2_xattr_ref object. If the reference counter of the xdatum
+ *   referred to by this xref becomes 0, delete_xattr_datum() is then called.
+ * save_xattr_ref(c, ref)
+ *   is used to write xref to medium.
+ * create_xattr_ref(c, ic, xd)
+ *   is used to create a new xref and write to medium.
+ * jffs2_xattr_delete_inode(c, ic)
+ *   is called to remove xrefs related to obsolete inode when inode is unlinked.
+ * jffs2_xattr_free_inode(c, ic)
+ *   is called to release xattr related objects when unmounting. 
+ * check_xattr_ref_inode(c, ic)
+ *   is used to confirm inode does not have duplicate xattr name/value pair.
+ * -------------------------------------------------- */
+static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_xref rr;
+	size_t readlen;
+	uint32_t crc, totlen;
+	int rc;
+
+	BUG_ON(ref_flags(ref->node) != REF_UNCHECKED);
+
+	rc = jffs2_flash_read(c, ref_offset(ref->node), sizeof(rr), &readlen, (char *)&rr);
+	if (rc || sizeof(rr) != readlen) {
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n",
+			      rc, sizeof(rr), readlen, ref_offset(ref->node));
+		return rc ? rc : -EIO;
+	}
+	/* obsolete node */
+	crc = crc32(0, &rr, sizeof(rr) - 4);
+	if (crc != je32_to_cpu(rr.node_crc)) {
+		if (je32_to_cpu(rr.node_crc) != 0xffffffff)
+			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				    ref_offset(ref->node), je32_to_cpu(rr.node_crc), crc);
+		return EIO;
+	}
+	if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
+	    || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
+	    || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) {
+		JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, "
+			    "nodetype=%#04x/%#04x, totlen=%u/%zu\n",
+			    ref_offset(ref->node), je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
+			    je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
+			    je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
+		return EIO;
+	}
+	ref->ino = je32_to_cpu(rr.ino);
+	ref->xid = je32_to_cpu(rr.xid);
+
+	/* fixup superblock/eraseblock info */
+	jeb = &c->blocks[ref_offset(ref->node) / c->sector_size];
+	totlen = PAD(sizeof(rr));
+
+	spin_lock(&c->erase_completion_lock);
+	c->unchecked_size -= totlen; c->used_size += totlen;
+	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+	ref->node->flash_offset = ref_offset(ref->node) | REF_PRISTINE;
+	spin_unlock(&c->erase_completion_lock);
+
+	dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n",
+		  ref->ino, ref->xid, ref_offset(ref->node));
+	return 0;
+}
+
+static void delete_xattr_ref_node(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	struct jffs2_raw_xref rr;
+	size_t length;
+	int rc;
+
+	if (jffs2_sum_active()) {
+		memset(&rr, 0xff, sizeof(rr));
+		rc = jffs2_flash_read(c, ref_offset(ref->node),
+				      sizeof(struct jffs2_unknown_node),
+				      &length, (char *)&rr);
+		if (rc || length != sizeof(struct jffs2_unknown_node)) {
+			JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
+				    rc, sizeof(struct jffs2_unknown_node),
+				    length, ref_offset(ref->node));
+		}
+		rc = jffs2_flash_write(c, ref_offset(ref->node), sizeof(rr),
+				       &length, (char *)&rr);
+		if (rc || length != sizeof(struct jffs2_raw_xref)) {
+			JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu at %#08x\n",
+				    rc, sizeof(rr), length, ref_offset(ref->node));
+		}
+	}
+	spin_lock(&c->erase_completion_lock);
+	ref->node->next_in_ino = NULL;
+	spin_unlock(&c->erase_completion_lock);
+	jffs2_mark_node_obsolete(c, ref->node);
+	ref->node = NULL;
+}
+
+static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+
+	BUG_ON(!ref->node);
+	delete_xattr_ref_node(c, ref);
+
+	xd = ref->xd;
+	xd->refcnt--;
+	if (!xd->refcnt)
+		delete_xattr_datum(c, xd);
+	jffs2_free_xattr_ref(ref);
+}
+
+static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_node_ref *raw;
+	struct jffs2_raw_xref rr;
+	size_t length;
+	uint32_t phys_ofs = write_ofs(c);
+	int ret;
+
+	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF);
+	rr.totlen = cpu_to_je32(PAD(sizeof(rr)));
+	rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4));
+
+	rr.ino = cpu_to_je32(ref->ic->ino);
+	rr.xid = cpu_to_je32(ref->xd->xid);
+	rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4));
+
+	ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr);
+	if (ret || sizeof(rr) != length) {
+		JFFS2_WARNING("jffs2_flash_write() returned %d, request=%zu, retlen=%zu, at %#08x\n",
+			      ret, sizeof(rr), length, phys_ofs);
+		ret = ret ? ret : -EIO;
+		if (length)
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(sizeof(rr)), NULL);
+
+		return ret;
+	}
+
+	raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), NULL);
+	/* FIXME */ raw->next_in_ino = (void *)ref;
+	if (ref->node)
+		delete_xattr_ref_node(c, ref);
+	ref->node = raw;
+
+	dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid);
+
+	return 0;
+}
+
+static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic,
+						struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_ref *ref;
+	int ret;
+
+	ref = jffs2_alloc_xattr_ref();
+	if (!ref)
+		return ERR_PTR(-ENOMEM);
+	ref->ic = ic;
+	ref->xd = xd;
+
+	ret = save_xattr_ref(c, ref);
+	if (ret) {
+		jffs2_free_xattr_ref(ref);
+		return ERR_PTR(ret);
+	}
+
+	/* Chain to inode */
+	ref->next = ic->xref;
+	ic->xref = ref;
+
+	return ref; /* success */
+}
+
+void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* It's called from jffs2_clear_inode() on inode removing.
+	   When an inode with XATTR is removed, those XATTRs must be removed. */
+	struct jffs2_xattr_ref *ref, *_ref;
+
+	if (!ic || ic->nlink > 0)
+		return;
+
+	down_write(&c->xattr_sem);
+	for (ref = ic->xref; ref; ref = _ref) {
+		_ref = ref->next;
+		delete_xattr_ref(c, ref);
+	}
+	ic->xref = NULL;
+	up_write(&c->xattr_sem);
+}
+
+void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* It's called from jffs2_free_ino_caches() when unmounting the FS. */
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, *_ref;
+
+	down_write(&c->xattr_sem);
+	for (ref = ic->xref; ref; ref = _ref) {
+		_ref = ref->next;
+		xd = ref->xd;
+		xd->refcnt--;
+		if (!xd->refcnt) {
+			unload_xattr_datum(c, xd);
+			jffs2_free_xattr_datum(xd);
+		}
+		jffs2_free_xattr_ref(ref);
+	}
+	ic->xref = NULL;
+	up_write(&c->xattr_sem);
+}
+
+static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* success of check_xattr_ref_inode() means that inode (ic) does not have
+	 * duplicate name/value pairs. If a duplicate name/value pair is found,
+	 * one of them will be removed.
+	 */
+	struct jffs2_xattr_ref *ref, *cmp, **pref;
+	int rc = 0;
+
+	if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED))
+		return 0;
+	down_write(&c->xattr_sem);
+ retry:
+	rc = 0;
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+		if (!ref->xd->xname) {
+			rc = load_xattr_datum(c, ref->xd);
+			if (unlikely(rc > 0)) {
+				*pref = ref->next;
+				delete_xattr_ref(c, ref);
+				goto retry;
+			} else if (unlikely(rc < 0))
+				goto out;
+		}
+		for (cmp=ref->next, pref=&ref->next; cmp; pref=&cmp->next, cmp=cmp->next) {
+			if (!cmp->xd->xname) {
+				ref->xd->flags |= JFFS2_XFLAGS_BIND;
+				rc = load_xattr_datum(c, cmp->xd);
+				ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
+				if (unlikely(rc > 0)) {
+					*pref = cmp->next;
+					delete_xattr_ref(c, cmp);
+					goto retry;
+				} else if (unlikely(rc < 0))
+					goto out;
+			}
+			if (ref->xd->xprefix == cmp->xd->xprefix
+			    && !strcmp(ref->xd->xname, cmp->xd->xname)) {
+				*pref = cmp->next;
+				delete_xattr_ref(c, cmp);
+				goto retry;
+			}
+		}
+	}
+	ic->flags |= INO_FLAGS_XATTR_CHECKED;
+ out:
+	up_write(&c->xattr_sem);
+
+	return rc;
+}
+
+/* -------- xattr subsystem functions ---------------
+ * jffs2_init_xattr_subsystem(c)
+ *   is used to initialize semaphore and list_head, and some variables.
+ * jffs2_find_xattr_datum(c, xid)
+ *   is used to lookup xdatum while scanning process.
+ * jffs2_clear_xattr_subsystem(c)
+ *   is used to release any xattr related objects.
+ * jffs2_build_xattr_subsystem(c)
+ *   is used to associate xdatum and xref while super block building process.
+ * jffs2_setup_xattr_datum(c, xid, version)
+ *   is used to insert xdatum while scanning process.
+ * -------------------------------------------------- */
+void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	int i;
+
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++)
+		INIT_LIST_HEAD(&c->xattrindex[i]);
+	INIT_LIST_HEAD(&c->xattr_unchecked);
+	c->xref_temp = NULL;
+
+	init_rwsem(&c->xattr_sem);
+	c->xdatum_mem_usage = 0;
+	c->xdatum_mem_threshold = 32 * 1024;	/* Default 32KB */
+}
+
+static struct jffs2_xattr_datum *jffs2_find_xattr_datum(struct jffs2_sb_info *c, uint32_t xid)
+{
+	struct jffs2_xattr_datum *xd;
+	int i = xid % XATTRINDEX_HASHSIZE;
+
+	/* It's only used in scanning/building process. */
+	BUG_ON(!(c->flags & (JFFS2_SB_FLAG_SCANNING|JFFS2_SB_FLAG_BUILDING)));
+
+	list_for_each_entry(xd, &c->xattrindex[i], xindex) {
+		if (xd->xid==xid)
+			return xd;
+	}
+	return NULL;
+}
+
+void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_xattr_ref *ref, *_ref;
+	int i;
+
+	for (ref=c->xref_temp; ref; ref = _ref) {
+		_ref = ref->next;
+		jffs2_free_xattr_ref(ref);
+	}
+	c->xref_temp = NULL;
+
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			list_del(&xd->xindex);
+			if (xd->xname)
+				kfree(xd->xname);
+			jffs2_free_xattr_datum(xd);
+		}
+	}
+}
+
+void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_ref *ref, *_ref;
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_inode_cache *ic;
+	int i, xdatum_count =0, xdatum_unchecked_count = 0, xref_count = 0;
+
+	BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+
+	/* Phase.1 */
+	for (ref=c->xref_temp; ref; ref=_ref) {
+		_ref = ref->next;
+		/* checking REF_UNCHECKED nodes */
+		if (ref_flags(ref->node) != REF_PRISTINE) {
+			if (verify_xattr_ref(c, ref)) {
+				delete_xattr_ref_node(c, ref);
+				jffs2_free_xattr_ref(ref);
+				continue;
+			}
+		}
+		/* At this point, ref->xid and ref->ino contain XID and inode number.
+		   ref->xd and ref->ic are not valid yet. */
+		xd = jffs2_find_xattr_datum(c, ref->xid);
+		ic = jffs2_get_ino_cache(c, ref->ino);
+		if (!xd || !ic) {
+			if (ref_flags(ref->node) != REF_UNCHECKED)
+				JFFS2_WARNING("xref(ino=%u, xid=%u) is orphan. \n",
+					      ref->ino, ref->xid);
+			delete_xattr_ref_node(c, ref);
+			jffs2_free_xattr_ref(ref);
+			continue;
+		}
+		ref->xd = xd;
+		ref->ic = ic;
+		xd->refcnt++;
+		ref->next = ic->xref;
+		ic->xref = ref;
+		xref_count++;
+	}
+	c->xref_temp = NULL;
+	/* After this, ref->xid/ino are NEVER used. */
+
+	/* Phase.2 */
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			list_del_init(&xd->xindex);
+			if (!xd->refcnt) {
+				if (ref_flags(xd->node) != REF_UNCHECKED)
+					JFFS2_WARNING("orphan xdatum(xid=%u, version=%u) at %#08x\n",
+						      xd->xid, xd->version, ref_offset(xd->node));
+				delete_xattr_datum(c, xd);
+				continue;
+			}
+			if (ref_flags(xd->node) != REF_PRISTINE) {
+				dbg_xattr("unchecked xdatum(xid=%u) at %#08x\n",
+					  xd->xid, ref_offset(xd->node));
+				list_add(&xd->xindex, &c->xattr_unchecked);
+				xdatum_unchecked_count++;
+			}
+			xdatum_count++;
+		}
+	}
+	/* build complete */
+	JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum (%u unchecked) and "
+		     "%u of xref found.\n", xdatum_count, xdatum_unchecked_count, xref_count);
+}
+
+struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+						  uint32_t xid, uint32_t version)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+
+	_xd = jffs2_find_xattr_datum(c, xid);
+	if (_xd) {
+		dbg_xattr("duplicate xdatum (xid=%u, version=%u/%u) at %#08x\n",
+			  xid, version, _xd->version, ref_offset(_xd->node));
+		if (version < _xd->version)
+			return ERR_PTR(-EEXIST);
+	}
+	xd = jffs2_alloc_xattr_datum();
+	if (!xd)
+		return ERR_PTR(-ENOMEM);
+	xd->xid = xid;
+	xd->version = version;
+	if (xd->xid > c->highest_xid)
+		c->highest_xid = xd->xid;
+	list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
+
+	if (_xd) {
+		list_del_init(&_xd->xindex);
+		delete_xattr_datum_node(c, _xd);
+		jffs2_free_xattr_datum(_xd);
+	}
+	return xd;
+}
+
+/* -------- xattr subsystem functions ---------------
+ * xprefix_to_handler(xprefix)
+ *   is used to translate xprefix into xattr_handler.
+ * jffs2_listxattr(dentry, buffer, size)
+ *   is an implementation of listxattr handler on jffs2.
+ * do_jffs2_getxattr(inode, xprefix, xname, buffer, size)
+ *   is an implementation of getxattr handler on jffs2.
+ * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
+ *   is an implementation of setxattr handler on jffs2.
+ * -------------------------------------------------- */
+struct xattr_handler *jffs2_xattr_handlers[] = {
+	&jffs2_user_xattr_handler,
+#ifdef CONFIG_JFFS2_FS_SECURITY
+	&jffs2_security_xattr_handler,
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	&jffs2_acl_access_xattr_handler,
+	&jffs2_acl_default_xattr_handler,
+#endif
+	&jffs2_trusted_xattr_handler,
+	NULL
+};
+
+static struct xattr_handler *xprefix_to_handler(int xprefix) {
+	struct xattr_handler *ret;
+
+	switch (xprefix) {
+	case JFFS2_XPREFIX_USER:
+		ret = &jffs2_user_xattr_handler;
+		break;
+#ifdef CONFIG_JFFS2_FS_SECURITY
+	case JFFS2_XPREFIX_SECURITY:
+		ret = &jffs2_security_xattr_handler;
+		break;
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	case JFFS2_XPREFIX_ACL_ACCESS:
+		ret = &jffs2_acl_access_xattr_handler;
+		break;
+	case JFFS2_XPREFIX_ACL_DEFAULT:
+		ret = &jffs2_acl_default_xattr_handler;
+		break;
+#endif
+	case JFFS2_XPREFIX_TRUSTED:
+		ret = &jffs2_trusted_xattr_handler;
+		break;
+	default:
+		ret = NULL;
+		break;
+	}
+	return ret;
+}
+
+ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_ref *ref, **pref;
+	struct jffs2_xattr_datum *xd;
+	struct xattr_handler *xhandle;
+	ssize_t len, rc;
+	int retry = 0;
+
+	rc = check_xattr_ref_inode(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	down_read(&c->xattr_sem);
+ retry:
+	len = 0;
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+		BUG_ON(ref->ic != ic);
+		xd = ref->xd;
+		if (!xd->xname) {
+			/* xdatum is uncached */
+			if (!retry) {
+				retry = 1;
+				up_read(&c->xattr_sem);
+				down_write(&c->xattr_sem);
+				goto retry;
+			} else {
+				rc = load_xattr_datum(c, xd);
+				if (unlikely(rc > 0)) {
+					*pref = ref->next;
+					delete_xattr_ref(c, ref);
+					goto retry;
+				} else if (unlikely(rc < 0))
+					goto out;
+			}
+		}
+		xhandle = xprefix_to_handler(xd->xprefix);
+		if (!xhandle)
+			continue;
+		if (buffer) {
+			rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len);
+		} else {
+			rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len);
+		}
+		if (rc < 0)
+			goto out;
+		len += rc;
+	}
+	rc = len;
+ out:
+	if (!retry) {
+		up_read(&c->xattr_sem);
+	} else {
+		up_write(&c->xattr_sem);
+	}
+	return rc;
+}
+
+int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+		      char *buffer, size_t size)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, **pref;
+	int rc, retry = 0;
+
+	rc = check_xattr_ref_inode(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	down_read(&c->xattr_sem);
+ retry:
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+		BUG_ON(ref->ic!=ic);
+
+		xd = ref->xd;
+		if (xd->xprefix != xprefix)
+			continue;
+		if (!xd->xname) {
+			/* xdatum is uncached */
+			if (!retry) {
+				retry = 1;
+				up_read(&c->xattr_sem);
+				down_write(&c->xattr_sem);
+				goto retry;
+			} else {
+				rc = load_xattr_datum(c, xd);
+				if (unlikely(rc > 0)) {
+					*pref = ref->next;
+					delete_xattr_ref(c, ref);
+					goto retry;
+				} else if (unlikely(rc < 0)) {
+					goto out;
+				}
+			}
+		}
+		if (!strcmp(xname, xd->xname)) {
+			rc = xd->value_len;
+			if (buffer) {
+				if (size < rc) {
+					rc = -ERANGE;
+				} else {
+					memcpy(buffer, xd->xvalue, rc);
+				}
+			}
+			goto out;
+		}
+	}
+	rc = -ENODATA;
+ out:
+	if (!retry) {
+		up_read(&c->xattr_sem);
+	} else {
+		up_write(&c->xattr_sem);
+	}
+	return rc;
+}
+/* Create, replace or (buffer == NULL) remove xattr (xprefix, xname) of inode. */
+int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+		      const char *buffer, size_t size, int flags)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, *newref, **pref;
+	uint32_t length, request;
+	int rc;
+
+	rc = check_xattr_ref_inode(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size);	/* header + name (+1) + value */
+	rc = jffs2_reserve_space(c, request, &length,
+				 ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+		return rc;
+	}
+
+	/* Find existing xattr */
+	down_write(&c->xattr_sem);
+ retry:
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+		xd = ref->xd;
+		if (xd->xprefix != xprefix)
+			continue;
+		if (!xd->xname) {
+			rc = load_xattr_datum(c, xd);
+			if (unlikely(rc > 0)) {	/* presumably xd is unusable - confirm in load_xattr_datum() */
+				*pref = ref->next;
+				delete_xattr_ref(c, ref);
+				goto retry;
+			} else if (unlikely(rc < 0))
+				goto out;
+		}
+		if (!strcmp(xd->xname, xname)) {
+			if (flags & XATTR_CREATE) {
+				rc = -EEXIST;
+				goto out;
+			}
+			if (!buffer) {	/* removal request: unlink and delete the existing ref */
+				*pref = ref->next;
+				delete_xattr_ref(c, ref);
+				rc = 0;
+				goto out;
+			}
+			goto found;
+		}
+	}
+	/* not found */
+	if (flags & XATTR_REPLACE) {
+		rc = -ENODATA;
+		goto out;
+	}
+	if (!buffer) {	/* removal requested but there is nothing to remove */
+		rc = -EINVAL;
+		goto out;
+	}
+ found:	/* ref != NULL: replacing an existing xattr; ref == NULL: creating a new one */
+	xd = create_xattr_datum(c, xprefix, xname, buffer, size);
+	if (IS_ERR(xd)) {
+		rc = PTR_ERR(xd);
+		goto out;
+	}
+	up_write(&c->xattr_sem);	/* lock is dropped while the second reservation is made */
+	jffs2_complete_reservation(c);
+
+	/* create xattr_ref */
+	request = PAD(sizeof(struct jffs2_raw_xref));
+	rc = jffs2_reserve_space(c, request, &length,
+				 ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+		down_write(&c->xattr_sem);
+		xd->refcnt--;	/* presumably undoes the count taken in create_xattr_datum() - confirm */
+		if (!xd->refcnt)
+			delete_xattr_datum(c, xd);
+		up_write(&c->xattr_sem);
+		return rc;	/* no jffs2_complete_reservation(): the reservation failed */
+	}
+	down_write(&c->xattr_sem);
+	if (ref)	/* NOTE(review): pref/ref date from before xattr_sem was dropped - confirm they stay valid */
+		*pref = ref->next;
+	newref = create_xattr_ref(c, ic, xd);
+	if (IS_ERR(newref)) {
+		if (ref) {	/* put the old ref back, at the head of ic->xref */
+			ref->next = ic->xref;
+			ic->xref = ref;
+		}
+		rc = PTR_ERR(newref);
+		xd->refcnt--;
+		if (!xd->refcnt)
+			delete_xattr_datum(c, xd);
+	} else if (ref) {	/* new ref is in place: the replaced one can go */
+		delete_xattr_ref(c, ref);
+	}
+ out:
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+	return rc;
+}
+
+/* -------- garbage collector functions -------------
+ * jffs2_garbage_collect_xattr_datum(c, xd)
+ *   is used to move xdatum into new node.
+ * jffs2_garbage_collect_xattr_ref(c, ref)
+ *   is used to move xref into new node.
+ * jffs2_verify_xattr(c)
+ *   is used to call do_verify_xattr_datum() before garbage collecting.
+ * -------------------------------------------------- */
+int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	uint32_t totlen, length, old_ofs;
+	int rc = -EINVAL;	/* returned when the old node is shorter than a raw xattr header */
+
+	down_write(&c->xattr_sem);
+	BUG_ON(!xd->node);
+
+	old_ofs = ref_offset(xd->node);	/* used only in the dbg_xattr() message below */
+	totlen = ref_totlen(c, c->gcblock, xd->node);
+	if (totlen < sizeof(struct jffs2_raw_xattr))
+		goto out;
+
+	if (!xd->xname) {	/* name/value not cached yet: load before rewriting */
+		rc = load_xattr_datum(c, xd);
+		if (unlikely(rc > 0)) {
+			delete_xattr_datum_node(c, xd);	/* node is dropped instead of moved */
+			rc = 0;
+			goto out;
+		} else if (unlikely(rc < 0))
+			goto out;
+	}
+	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc || length < totlen) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, totlen);
+		rc = rc ? rc : -EBADFD;	/* -EBADFD: reservation succeeded but is too short */
+		goto out;
+	}
+	rc = save_xattr_datum(c, xd);	/* presumably writes the new node and updates xd->node - confirm */
+	if (!rc)
+		dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
+			  xd->xid, xd->version, old_ofs, ref_offset(xd->node));
+ out:
+	up_write(&c->xattr_sem);
+	return rc;
+}
+
+/* GC helper: rewrite the on-flash node of one xref; returns 0 or -errno. */
+int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	uint32_t totlen, length, old_ofs;
+	int rc = -EINVAL;	/* returned when the old node is not exactly one raw xref in size */
+
+	down_write(&c->xattr_sem);
+	BUG_ON(!ref->node);
+
+	old_ofs = ref_offset(ref->node);	/* used only in the dbg_xattr() message below */
+	totlen = ref_totlen(c, c->gcblock, ref->node);
+	if (totlen != sizeof(struct jffs2_raw_xref))
+		goto out;
+
+	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE);
+	if (rc || length < totlen) {
+		JFFS2_WARNING("%s: jffs2_reserve_space() = %d, request = %u\n",
+			      __FUNCTION__, rc, totlen);
+		rc = rc ? rc : -EBADFD;	/* -EBADFD: reservation succeeded but is too short */
+		goto out;
+	}
+	rc = save_xattr_ref(c, ref);	/* presumably writes the new node and updates ref->node - confirm */
+	if (!rc)
+		dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
+			  ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
+ out:
+	up_write(&c->xattr_sem);
+	return rc;
+}
+/* Check xdatums on c->xattr_unchecked; returns 1 once that list is empty, else 0. */
+int jffs2_verify_xattr(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+	int rc;
+
+	down_write(&c->xattr_sem);
+	list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) {
+		rc = do_verify_xattr_datum(c, xd);
+		if (rc == 0) {
+			list_del_init(&xd->xindex);
+			break;	/* stop after the first datum that verified successfully */
+		} else if (rc > 0) {
+			list_del_init(&xd->xindex);
+			delete_xattr_datum_node(c, xd);
+		}	/* rc < 0: xd stays on the unchecked list and the walk continues */
+	}
+	up_write(&c->xattr_sem);
+
+	return list_empty(&c->xattr_unchecked) ? 1 : 0;
+}
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
new file mode 100644
index 000000000000..2c199856c582
--- /dev/null
+++ b/fs/jffs2/xattr.h
@@ -0,0 +1,116 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#ifndef _JFFS2_FS_XATTR_H_
+#define _JFFS2_FS_XATTR_H_
+
+#include <linux/xattr.h>
+#include <linux/list.h>
+
+#define JFFS2_XFLAGS_HOT	(0x01)	/* This datum is HOT */
+#define JFFS2_XFLAGS_BIND	(0x02)	/* This datum is not reclaimed */
+
+struct jffs2_xattr_datum
+{
+	void *always_null;	/* NOTE(review): first three members mirror struct jffs2_xattr_ref - confirm why */
+	struct jffs2_raw_node_ref *node;	/* on-flash node of this datum (see ref_offset(xd->node) users) */
+	uint8_t class;
+	uint8_t flags;		/* JFFS2_XFLAGS_* bits */
+	uint16_t xprefix;			/* see JFFS2_XPREFIX_* */
+
+	struct list_head xindex;	/* chained from c->xattrindex[n] or c->xattr_unchecked */
+	uint32_t refcnt;		/* # of xattr_ref refers this */
+	uint32_t xid;
+	uint32_t version;
+
+	uint32_t data_crc;
+	uint32_t hashkey;
+	char *xname;		/* XATTR name without prefix; NULL while not loaded */
+	uint32_t name_len;	/* length of xname */
+	char *xvalue;		/* XATTR value */
+	uint32_t value_len;	/* length of xvalue */
+};
+
+struct jffs2_inode_cache;
+struct jffs2_xattr_ref
+{
+	void *always_null;	/* NOTE(review): first three members mirror struct jffs2_xattr_datum - confirm why */
+	struct jffs2_raw_node_ref *node;	/* on-flash node of this ref (see ref_offset(ref->node) users) */
+	uint8_t class;
+	uint8_t flags;		/* Currently unused */
+	u16 unused;
+
+	union {
+		struct jffs2_inode_cache *ic;	/* reference to jffs2_inode_cache */
+		uint32_t ino;			/* only used in scanning/building  */
+	};
+	union {
+		struct jffs2_xattr_datum *xd;	/* reference to jffs2_xattr_datum */
+		uint32_t xid;			/* only used in scanning/building */
+	};
+	struct jffs2_xattr_ref *next;		/* chained from ic->xref */
+};
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+
+extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c);
+
+extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+                                                  uint32_t xid, uint32_t version);
+
+extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+
+extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd);
+extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref);
+extern int jffs2_verify_xattr(struct jffs2_sb_info *c);
+
+extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+			     char *buffer, size_t size);
+extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+			     const char *buffer, size_t size, int flags);
+
+extern struct xattr_handler *jffs2_xattr_handlers[];
+extern struct xattr_handler jffs2_user_xattr_handler;
+extern struct xattr_handler jffs2_trusted_xattr_handler;
+
+extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
+#define jffs2_getxattr		generic_getxattr
+#define jffs2_setxattr		generic_setxattr
+#define jffs2_removexattr	generic_removexattr
+
+#else /* !CONFIG_JFFS2_FS_XATTR: hooks expand to nothing, operations to NULL */
+
+#define jffs2_init_xattr_subsystem(c)
+#define jffs2_build_xattr_subsystem(c)
+#define jffs2_clear_xattr_subsystem(c)
+
+#define jffs2_xattr_delete_inode(c, ic)
+#define jffs2_xattr_free_inode(c, ic)
+#define jffs2_verify_xattr(c)			(1)	/* same value the real function returns when nothing is unchecked */
+
+#define jffs2_xattr_handlers	NULL
+#define jffs2_listxattr		NULL
+#define jffs2_getxattr		NULL
+#define jffs2_setxattr		NULL
+#define jffs2_removexattr	NULL
+
+#endif /* CONFIG_JFFS2_FS_XATTR */
+
+#ifdef CONFIG_JFFS2_FS_SECURITY
+extern int jffs2_init_security(struct inode *inode, struct inode *dir);
+extern struct xattr_handler jffs2_security_xattr_handler;
+#else /* !CONFIG_JFFS2_FS_SECURITY: init becomes a constant 0 */
+#define jffs2_init_security(inode,dir)	(0)
+#endif /* CONFIG_JFFS2_FS_SECURITY */
+
+#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
new file mode 100644
index 000000000000..ed046e19dbfa
--- /dev/null
+++ b/fs/jffs2/xattr_trusted.c
@@ -0,0 +1,52 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* "get" handler: rejects an empty name, else reads from the TRUSTED prefix space. */
+static int jffs2_trusted_getxattr(struct inode *inode, const char *name,
+				  void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))	/* name is what follows the prefix; empty is invalid */
+		return -EINVAL;
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size);
+}
+/* "set" handler: rejects an empty name, else writes to the TRUSTED prefix space. */
+static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer,
+				  size_t size, int flags)
+{
+	if (!strcmp(name, ""))	/* name is what follows the prefix; empty is invalid */
+		return -EINVAL;
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags);
+}
+/* "list" handler: emits XATTR_TRUSTED_PREFIX + name if it fits; returns bytes needed. */
+static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size,
+				      const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;	/* prefix + name + one more byte */
+
+	if (list && retlen<=list_size) {	/* too small or NULL list: write nothing, still return retlen */
+		strcpy(list, XATTR_TRUSTED_PREFIX);
+		strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);	/* assumes name is NUL-terminated - confirm */
+	}
+
+	return retlen;
+}
+/* Handler table binding XATTR_TRUSTED_PREFIX to the three callbacks above. */
+struct xattr_handler jffs2_trusted_xattr_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.list = jffs2_trusted_listxattr,
+	.set = jffs2_trusted_setxattr,
+	.get = jffs2_trusted_getxattr
+};
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
new file mode 100644
index 000000000000..2f8e9aa01ea0
--- /dev/null
+++ b/fs/jffs2/xattr_user.c
@@ -0,0 +1,52 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* "get" handler: rejects an empty name, else reads from the USER prefix space. */
+static int jffs2_user_getxattr(struct inode *inode, const char *name,
+                               void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))	/* name is what follows the prefix; empty is invalid */
+		return -EINVAL;
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size);
+}
+/* "set" handler: rejects an empty name, else writes to the USER prefix space. */
+static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer,
+                               size_t size, int flags)
+{
+	if (!strcmp(name, ""))	/* name is what follows the prefix; empty is invalid */
+		return -EINVAL;
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags);
+}
+/* "list" handler: emits XATTR_USER_PREFIX + name if it fits; returns bytes needed. */
+static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size,
+				   const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;	/* prefix + name + one more byte */
+
+	if (list && retlen <= list_size) {	/* too small or NULL list: write nothing, still return retlen */
+		strcpy(list, XATTR_USER_PREFIX);
+		strcpy(list + XATTR_USER_PREFIX_LEN, name);	/* assumes name is NUL-terminated - confirm */
+	}
+
+	return retlen;
+}
+/* Handler table binding XATTR_USER_PREFIX to the three callbacks above. */
+struct xattr_handler jffs2_user_xattr_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.list = jffs2_user_listxattr,
+	.set = jffs2_user_setxattr,
+	.get = jffs2_user_getxattr
+};
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 2b220dd6b4e7..7f6e88039700 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -632,10 +632,9 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
 		}
 		SetPageUptodate(page);
 	} else {
-		page = read_cache_page(mapping, page_index,
-			    (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, page_index, NULL);
 		if (IS_ERR(page) || !PageUptodate(page)) {
-			jfs_err("read_cache_page failed!");
+			jfs_err("read_mapping_page failed!");
 			return NULL;
 		}
 		lock_page(page);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index db6f41d6dd60..73d2aba084c6 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -139,9 +139,9 @@ static void jfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(jfs_inode_cachep, ji);
 }
 
-static int jfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
 	s64 maxinodes;
 	struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap;
 
@@ -565,10 +565,11 @@ static void jfs_unlockfs(struct super_block *sb)
 	}
 }
 
-static struct super_block *jfs_get_sb(struct file_system_type *fs_type, 
-	int flags, const char *dev_name, void *data)
+static int jfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
+			   mnt);
 }
 
 static int jfs_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/libfs.c b/fs/libfs.c
index 7145ba7a48d0..fc785d8befb9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,9 +20,9 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	buf->f_type = sb->s_magic;
+	buf->f_type = dentry->d_sb->s_magic;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_namelen = NAME_MAX;
 	return 0;
@@ -196,9 +196,9 @@ struct inode_operations simple_dir_inode_operations = {
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
-struct super_block *
-get_sb_pseudo(struct file_system_type *fs_type, char *name,
-	struct super_operations *ops, unsigned long magic)
+int get_sb_pseudo(struct file_system_type *fs_type, char *name,
+	struct super_operations *ops, unsigned long magic,
+	struct vfsmount *mnt)
 {
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 	static struct super_operations default_ops = {.statfs = simple_statfs};
@@ -207,7 +207,7 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	struct qstr d_name = {.name = name, .len = strlen(name)};
 
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 
 	s->s_flags = MS_NOUSER;
 	s->s_maxbytes = ~0ULL;
@@ -232,12 +232,12 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 
 Enomem:
 	up_write(&s->s_umount);
 	deactivate_super(s);
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 }
 
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
@@ -424,13 +424,13 @@ out:
 
 static DEFINE_SPINLOCK(pin_fs_lock);
 
-int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
 {
 	struct vfsmount *mnt = NULL;
 	spin_lock(&pin_fs_lock);
 	if (unlikely(!*mount)) {
 		spin_unlock(&pin_fs_lock);
-		mnt = do_kern_mount(name, 0, name, NULL);
+		mnt = vfs_kern_mount(type, 0, type->name, NULL);
 		if (IS_ERR(mnt))
 			return PTR_ERR(mnt);
 		spin_lock(&pin_fs_lock);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index bce744468708..52774feab93f 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -147,11 +147,10 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
  * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
  * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
  */
-static inline
-void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
+static void nlmclnt_prepare_reclaim(struct nlm_host *host)
 {
+	down_write(&host->h_rwsem);
 	host->h_monitored = 0;
-	host->h_nsmstate = newstate;
 	host->h_state++;
 	host->h_nextrebind = 0;
 	nlm_rebind_host(host);
@@ -164,6 +163,13 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 	dprintk("NLM: reclaiming locks for host %s", host->h_name);
 }
 
+static void nlmclnt_finish_reclaim(struct nlm_host *host)
+{
+	host->h_reclaiming = 0;	/* cleared while h_rwsem is still held for writing */
+	up_write(&host->h_rwsem);	/* pairs with down_write() in nlmclnt_prepare_reclaim() */
+	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
+}
+
 /*
  * Reclaim all locks on server host. We do this by spawning a separate
  * reclaimer thread.
@@ -171,12 +177,10 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 void
 nlmclnt_recovery(struct nlm_host *host, u32 newstate)
 {
-	if (host->h_reclaiming++) {
-		if (host->h_nsmstate == newstate)
-			return;
-		nlmclnt_prepare_reclaim(host, newstate);
-	} else {
-		nlmclnt_prepare_reclaim(host, newstate);
+	if (host->h_nsmstate == newstate)
+		return;
+	host->h_nsmstate = newstate;
+	if (!host->h_reclaiming++) {
 		nlm_get_host(host);
 		__module_get(THIS_MODULE);
 		if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0)
@@ -190,6 +194,7 @@ reclaimer(void *ptr)
 	struct nlm_host	  *host = (struct nlm_host *) ptr;
 	struct nlm_wait	  *block;
 	struct file_lock *fl, *next;
+	u32 nsmstate;
 
 	daemonize("%s-reclaim", host->h_name);
 	allow_signal(SIGKILL);
@@ -199,19 +204,25 @@ reclaimer(void *ptr)
 	lock_kernel();
 	lockd_up();
 
+	nlmclnt_prepare_reclaim(host);
 	/* First, reclaim all locks that have been marked. */
 restart:
+	nsmstate = host->h_nsmstate;
 	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
 		list_del_init(&fl->fl_u.nfs_fl.list);
 
 		if (signalled())
 			continue;
-		if (nlmclnt_reclaim(host, fl) == 0)
-			list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
-		goto restart;
+		if (nlmclnt_reclaim(host, fl) != 0)
+			continue;
+		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
+		if (host->h_nsmstate != nsmstate) {
+			/* Argh! The server rebooted again! */
+			list_splice_init(&host->h_granted, &host->h_reclaim);
+			goto restart;
+		}
 	}
-
-	host->h_reclaiming = 0;
+	nlmclnt_finish_reclaim(host);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index f96e38155b5c..4db62098d3f4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -508,7 +508,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	block = nlmclnt_prepare_block(host, fl);
+again:
 	for(;;) {
+		/* Reboot protection */
+		fl->fl_u.nfs_fl.state = host->h_state;
 		status = nlmclnt_call(req, NLMPROC_LOCK);
 		if (status < 0)
 			goto out_unblock;
@@ -531,10 +534,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	if (resp->status == NLM_LCK_GRANTED) {
-		fl->fl_u.nfs_fl.state = host->h_state;
+		down_read(&host->h_rwsem);
+		/* Check whether or not the server has rebooted */
+		if (fl->fl_u.nfs_fl.state != host->h_state) {
+			up_read(&host->h_rwsem);
+			goto again;
+		}
 		fl->fl_flags |= FL_SLEEP;
 		/* Ensure the resulting lock will get added to granted list */
 		do_vfs_lock(fl);
+		up_read(&host->h_rwsem);
 	}
 	status = nlm_stat_to_errno(resp->status);
 out_unblock:
@@ -596,6 +605,7 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
 static int
 nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 {
+	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
 	int		status;
 
@@ -604,7 +614,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
 	 * case, we want to unlock.
 	 */
+	down_read(&host->h_rwsem);
 	do_vfs_lock(fl);
+	up_read(&host->h_rwsem);
 
 	if (req->a_flags & RPC_TASK_ASYNC)
 		return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 729ac427d359..38b0e8a1aec0 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -112,11 +112,12 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	host->h_version    = version;
 	host->h_proto      = proto;
 	host->h_rpcclnt    = NULL;
-	init_MUTEX(&host->h_sema);
+	mutex_init(&host->h_mutex);
 	host->h_nextrebind = jiffies + NLM_HOST_REBIND;
 	host->h_expires    = jiffies + NLM_HOST_EXPIRE;
 	atomic_set(&host->h_count, 1);
 	init_waitqueue_head(&host->h_gracewait);
+	init_rwsem(&host->h_rwsem);
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
 	host->h_server	   = server;
@@ -172,7 +173,7 @@ nlm_bind_host(struct nlm_host *host)
 			(unsigned)ntohl(host->h_addr.sin_addr.s_addr));
 
 	/* Lock host handle */
-	down(&host->h_sema);
+	mutex_lock(&host->h_mutex);
 
 	/* If we've already created an RPC client, check whether
 	 * RPC rebind is required
@@ -204,12 +205,12 @@ nlm_bind_host(struct nlm_host *host)
 		host->h_rpcclnt = clnt;
 	}
 
-	up(&host->h_sema);
+	mutex_unlock(&host->h_mutex);
 	return clnt;
 
 forgetit:
 	printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
-	up(&host->h_sema);
+	mutex_unlock(&host->h_mutex);
 	return NULL;
 }
 
diff --git a/fs/locks.c b/fs/locks.c
index 6f99c0a6f836..1ad29c9b6252 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -703,7 +703,7 @@ EXPORT_SYMBOL(posix_test_lock);
  * from a broken NFS client. But broken NFS clients have a lot more to
  * worry about than proper deadlock detection anyway... --okir
  */
-int posix_locks_deadlock(struct file_lock *caller_fl,
+static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
 	struct list_head *tmp;
@@ -722,8 +722,6 @@ next_task:
 	return 0;
 }
 
-EXPORT_SYMBOL(posix_locks_deadlock);
-
 /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
  * at the head of the list, but that's secret knowledge known only to
  * flock_lock_file and posix_lock_file.
@@ -755,6 +753,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	if (request->fl_type == F_UNLCK)
 		goto out;
 
+	error = -ENOMEM;
 	new_fl = locks_alloc_lock();
 	if (new_fl == NULL)
 		goto out;
@@ -781,6 +780,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	locks_copy_lock(new_fl, request);
 	locks_insert_lock(&inode->i_flock, new_fl);
 	new_fl = NULL;
+	error = 0;
 
 out:
 	unlock_kernel();
@@ -792,7 +792,8 @@ out:
 static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
 {
 	struct file_lock *fl;
-	struct file_lock *new_fl, *new_fl2;
+	struct file_lock *new_fl = NULL;
+	struct file_lock *new_fl2 = NULL;
 	struct file_lock *left = NULL;
 	struct file_lock *right = NULL;
 	struct file_lock **before;
@@ -801,9 +802,15 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
 	/*
 	 * We may need two file_lock structures for this operation,
 	 * so we get them in advance to avoid races.
+	 *
+	 * In some cases we can be sure, that no new locks will be needed
 	 */
-	new_fl = locks_alloc_lock();
-	new_fl2 = locks_alloc_lock();
+	if (!(request->fl_flags & FL_ACCESS) &&
+	    (request->fl_type != F_UNLCK ||
+	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
+		new_fl = locks_alloc_lock();
+		new_fl2 = locks_alloc_lock();
+	}
 
 	lock_kernel();
 	if (request->fl_type != F_UNLCK) {
@@ -832,14 +839,7 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
 	if (request->fl_flags & FL_ACCESS)
 		goto out;
 
-	error = -ENOLCK; /* "no luck" */
-	if (!(new_fl && new_fl2))
-		goto out;
-
 	/*
-	 * We've allocated the new locks in advance, so there are no
-	 * errors possible (and no blocking operations) from here on.
-	 * 
 	 * Find the first old lock with the same owner as the new lock.
 	 */
 	
@@ -936,10 +936,25 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
 		before = &fl->fl_next;
 	}
 
+	/*
+	 * The above code only modifies existing locks in case of
+	 * merging or replacing.  If new lock(s) need to be inserted
+	 * all modifications are done below this, so it's safe yet to
+	 * bail out.
+	 */
+	error = -ENOLCK; /* "no luck" */
+	if (right && left == right && !new_fl2)
+		goto out;
+
 	error = 0;
 	if (!added) {
 		if (request->fl_type == F_UNLCK)
 			goto out;
+
+		if (!new_fl) {
+			error = -ENOLCK;
+			goto out;
+		}
 		locks_copy_lock(new_fl, request);
 		locks_insert_lock(before, new_fl);
 		new_fl = NULL;
@@ -1879,19 +1894,18 @@ out:
  */
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
-	struct file_lock lock, **before;
+	struct file_lock lock;
 
 	/*
 	 * If there are no locks held on this file, we don't need to call
 	 * posix_lock_file().  Another process could be setting a lock on this
 	 * file at the same time, but we wouldn't remove that lock anyway.
 	 */
-	before = &filp->f_dentry->d_inode->i_flock;
-	if (*before == NULL)
+	if (!filp->f_dentry->d_inode->i_flock)
 		return;
 
 	lock.fl_type = F_UNLCK;
-	lock.fl_flags = FL_POSIX;
+	lock.fl_flags = FL_POSIX | FL_CLOSE;
 	lock.fl_start = 0;
 	lock.fl_end = OFFSET_MAX;
 	lock.fl_owner = owner;
@@ -1900,25 +1914,11 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 	lock.fl_ops = NULL;
 	lock.fl_lmops = NULL;
 
-	if (filp->f_op && filp->f_op->lock != NULL) {
+	if (filp->f_op && filp->f_op->lock != NULL)
 		filp->f_op->lock(filp, F_SETLK, &lock);
-		goto out;
-	}
+	else
+		posix_lock_file(filp, &lock);
 
-	/* Can't use posix_lock_file here; we need to remove it no matter
-	 * which pid we have.
-	 */
-	lock_kernel();
-	while (*before != NULL) {
-		struct file_lock *fl = *before;
-		if (IS_POSIX(fl) && posix_same_owner(fl, &lock)) {
-			locks_delete_lock(before);
-			continue;
-		}
-		before = &fl->fl_next;
-	}
-	unlock_kernel();
-out:
 	if (lock.fl_ops && lock.fl_ops->fl_release_private)
 		lock.fl_ops->fl_release_private(&lock);
 }
@@ -2204,63 +2204,6 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 
 EXPORT_SYMBOL(lock_may_write);
 
-static inline void __steal_locks(struct file *file, fl_owner_t from)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct file_lock *fl = inode->i_flock;
-
-	while (fl) {
-		if (fl->fl_file == file && fl->fl_owner == from)
-			fl->fl_owner = current->files;
-		fl = fl->fl_next;
-	}
-}
-
-/* When getting ready for executing a binary, we make sure that current
- * has a files_struct on its own. Before dropping the old files_struct,
- * we take over ownership of all locks for all file descriptors we own.
- * Note that we may accidentally steal a lock for a file that a sibling
- * has created since the unshare_files() call.
- */
-void steal_locks(fl_owner_t from)
-{
-	struct files_struct *files = current->files;
-	int i, j;
-	struct fdtable *fdt;
-
-	if (from == files)
-		return;
-
-	lock_kernel();
-	j = 0;
-
-	/*
-	 * We are not taking a ref to the file structures, so
-	 * we need to acquire ->file_lock.
-	 */
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	for (;;) {
-		unsigned long set;
-		i = j * __NFDBITS;
-		if (i >= fdt->max_fdset || i >= fdt->max_fds)
-			break;
-		set = fdt->open_fds->fds_bits[j++];
-		while (set) {
-			if (set & 1) {
-				struct file *file = fdt->fd[i];
-				if (file)
-					__steal_locks(file, from);
-			}
-			i++;
-			set >>= 1;
-		}
-	}
-	spin_unlock(&files->file_lock);
-	unlock_kernel();
-}
-EXPORT_SYMBOL(steal_locks);
-
 static int __init filelock_init(void)
 {
 	filelock_cache = kmem_cache_create("file_lock_cache",
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 69224d1fe043..2b0a389d1987 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -60,8 +60,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 2dcccf1d1b7f..a6fb509b7341 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -19,7 +19,7 @@
 
 static void minix_read_inode(struct inode * inode);
 static int minix_write_inode(struct inode * inode, int wait);
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf);
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
 static void minix_delete_inode(struct inode *inode)
@@ -296,11 +296,11 @@ out_bad_sb:
 	return -EINVAL;
 }
 
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf)
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct minix_sb_info *sbi = minix_sb(sb);
-	buf->f_type = sb->s_magic;
-	buf->f_bsize = sb->s_blocksize;
+	struct minix_sb_info *sbi = minix_sb(dentry->d_sb);
+	buf->f_type = dentry->d_sb->s_magic;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
 	buf->f_bfree = minix_count_free_blocks(sbi);
 	buf->f_bavail = buf->f_bfree;
@@ -559,10 +559,11 @@ void minix_truncate(struct inode * inode)
 		V2_minix_truncate(inode);
 }
 
-static struct super_block *minix_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int minix_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
+			   mnt);
 }
 
 static struct file_system_type minix_fs_type = {
diff --git a/fs/mpage.c b/fs/mpage.c
index 9bf2eb30e6f4..1e4598247d0b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -707,9 +707,9 @@ mpage_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
-	pgoff_t end = -1;		/* Inclusive */
+	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
-	int is_range = 0;
+	int range_whole = 0;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -721,16 +721,14 @@ mpage_writepages(struct address_space *mapping,
 		writepage = mapping->a_ops->writepage;
 
 	pagevec_init(&pvec, 0);
-	if (wbc->sync_mode == WB_SYNC_NONE) {
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
 	} else {
-		index = 0;			  /* whole-file sweep */
-		scanned = 1;
-	}
-	if (wbc->start || wbc->end) {
-		index = wbc->start >> PAGE_CACHE_SHIFT;
-		end = wbc->end >> PAGE_CACHE_SHIFT;
-		is_range = 1;
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -759,7 +757,7 @@ retry:
 				continue;
 			}
 
-			if (unlikely(is_range) && page->index > end) {
+			if (!wbc->range_cyclic && page->index > end) {
 				done = 1;
 				unlock_page(page);
 				continue;
@@ -810,7 +808,7 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (!is_range)
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 5b76ccd19e3f..9e44158a7540 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -661,11 +661,12 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *msdos_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int msdos_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+			   mnt);
 }
 
 static struct file_system_type msdos_fs_type = {
diff --git a/fs/namei.c b/fs/namei.c
index 96723ae83c89..c784e8bb57a3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1080,8 +1080,8 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 	nd->flags = flags;
 	nd->depth = 0;
 
-	read_lock(&current->fs->lock);
 	if (*name=='/') {
+		read_lock(&current->fs->lock);
 		if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
 			nd->mnt = mntget(current->fs->altrootmnt);
 			nd->dentry = dget(current->fs->altroot);
@@ -1092,48 +1092,49 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 		}
 		nd->mnt = mntget(current->fs->rootmnt);
 		nd->dentry = dget(current->fs->root);
+		read_unlock(&current->fs->lock);
 	} else if (dfd == AT_FDCWD) {
+		read_lock(&current->fs->lock);
 		nd->mnt = mntget(current->fs->pwdmnt);
 		nd->dentry = dget(current->fs->pwd);
+		read_unlock(&current->fs->lock);
 	} else {
 		struct dentry *dentry;
 
 		file = fget_light(dfd, &fput_needed);
 		retval = -EBADF;
 		if (!file)
-			goto unlock_fail;
+			goto out_fail;
 
 		dentry = file->f_dentry;
 
 		retval = -ENOTDIR;
 		if (!S_ISDIR(dentry->d_inode->i_mode))
-			goto fput_unlock_fail;
+			goto fput_fail;
 
 		retval = file_permission(file, MAY_EXEC);
 		if (retval)
-			goto fput_unlock_fail;
+			goto fput_fail;
 
 		nd->mnt = mntget(file->f_vfsmnt);
 		nd->dentry = dget(dentry);
 
 		fput_light(file, fput_needed);
 	}
-	read_unlock(&current->fs->lock);
 	current->total_link_count = 0;
 	retval = link_path_walk(name, nd);
 out:
 	if (likely(retval == 0)) {
 		if (unlikely(current->audit_context && nd && nd->dentry &&
 				nd->dentry->d_inode))
-		audit_inode(name, nd->dentry->d_inode, flags);
+		audit_inode(name, nd->dentry->d_inode);
 	}
+out_fail:
 	return retval;
 
-fput_unlock_fail:
+fput_fail:
 	fput_light(file, fput_needed);
-unlock_fail:
-	read_unlock(&current->fs->lock);
-	return retval;
+	goto out_fail;
 }
 
 int fastcall path_lookup(const char *name, unsigned int flags,
@@ -2242,14 +2243,16 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 	int error;
 	char * to;
 
-	if (flags != 0)
+	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
 		return -EINVAL;
 
 	to = getname(newname);
 	if (IS_ERR(to))
 		return PTR_ERR(to);
 
-	error = __user_walk_fd(olddfd, oldname, 0, &old_nd);
+	error = __user_walk_fd(olddfd, oldname,
+			       flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
+			       &old_nd);
 	if (error)
 		goto exit;
 	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
@@ -2576,8 +2579,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
 	struct page * page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage,
-				NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
 	wait_on_page_locked(page);
diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb852..866430bb024d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -86,6 +86,15 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	return mnt;
 }
 
+int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
+{
+	mnt->mnt_sb = sb;
+	mnt->mnt_root = dget(sb->s_root);
+	return 0;
+}
+
+EXPORT_SYMBOL(simple_set_mnt);
+
 void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
@@ -576,8 +585,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	 */
 
 	lock_kernel();
-	if ((flags & MNT_FORCE) && sb->s_op->umount_begin)
-		sb->s_op->umount_begin(sb);
+	if (sb->s_op->umount_begin)
+		sb->s_op->umount_begin(mnt, flags);
 	unlock_kernel();
 
 	/*
@@ -1163,13 +1172,46 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
 }
 
 /*
+ * go through the vfsmounts we've just consigned to the graveyard to
+ * - check that they're still dead
+ * - delete the vfsmount from the appropriate namespace under lock
+ * - dispose of the corpse
+ */
+static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
+{
+	struct namespace *namespace;
+	struct vfsmount *mnt;
+
+	while (!list_empty(graveyard)) {
+		LIST_HEAD(umounts);
+		mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire);
+		list_del_init(&mnt->mnt_expire);
+
+		/* don't do anything if the namespace is dead - all the
+		 * vfsmounts from it are going away anyway */
+		namespace = mnt->mnt_namespace;
+		if (!namespace || !namespace->root)
+			continue;
+		get_namespace(namespace);
+
+		spin_unlock(&vfsmount_lock);
+		down_write(&namespace_sem);
+		expire_mount(mnt, mounts, &umounts);
+		up_write(&namespace_sem);
+		release_mounts(&umounts);
+		mntput(mnt);
+		put_namespace(namespace);
+		spin_lock(&vfsmount_lock);
+	}
+}
+
+/*
  * process a list of expirable mountpoints with the intent of discarding any
  * mountpoints that aren't in use and haven't been touched since last we came
  * here
  */
 void mark_mounts_for_expiry(struct list_head *mounts)
 {
-	struct namespace *namespace;
 	struct vfsmount *mnt, *next;
 	LIST_HEAD(graveyard);
 
@@ -1193,38 +1235,79 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		list_move(&mnt->mnt_expire, &graveyard);
 	}
 
-	/*
-	 * go through the vfsmounts we've just consigned to the graveyard to
-	 * - check that they're still dead
-	 * - delete the vfsmount from the appropriate namespace under lock
-	 * - dispose of the corpse
-	 */
-	while (!list_empty(&graveyard)) {
-		LIST_HEAD(umounts);
-		mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
-		list_del_init(&mnt->mnt_expire);
+	expire_mount_list(&graveyard, mounts);
 
-		/* don't do anything if the namespace is dead - all the
-		 * vfsmounts from it are going away anyway */
-		namespace = mnt->mnt_namespace;
-		if (!namespace || !namespace->root)
+	spin_unlock(&vfsmount_lock);
+}
+
+EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+
+/*
+ * Ripoff of 'select_parent()'
+ *
+ * search the list of submounts for a given mountpoint, and move any
+ * shrinkable submounts to the 'graveyard' list.
+ */
+static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
+{
+	struct vfsmount *this_parent = parent;
+	struct list_head *next;
+	int found = 0;
+
+repeat:
+	next = this_parent->mnt_mounts.next;
+resume:
+	while (next != &this_parent->mnt_mounts) {
+		struct list_head *tmp = next;
+		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
+
+		next = tmp->next;
+		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
 			continue;
-		get_namespace(namespace);
+		/*
+		 * Descend a level if the mnt_mounts list is non-empty.
+		 */
+		if (!list_empty(&mnt->mnt_mounts)) {
+			this_parent = mnt;
+			goto repeat;
+		}
 
-		spin_unlock(&vfsmount_lock);
-		down_write(&namespace_sem);
-		expire_mount(mnt, mounts, &umounts);
-		up_write(&namespace_sem);
-		release_mounts(&umounts);
-		mntput(mnt);
-		put_namespace(namespace);
-		spin_lock(&vfsmount_lock);
+		if (!propagate_mount_busy(mnt, 1)) {
+			mntget(mnt);
+			list_move_tail(&mnt->mnt_expire, graveyard);
+			found++;
+		}
 	}
+	/*
+	 * All done at this level ... ascend and resume the search
+	 */
+	if (this_parent != parent) {
+		next = this_parent->mnt_child.next;
+		this_parent = this_parent->mnt_parent;
+		goto resume;
+	}
+	return found;
+}
+
+/*
+ * process a list of expirable mountpoints with the intent of discarding any
+ * submounts of a specific parent mountpoint
+ */
+void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
+{
+	LIST_HEAD(graveyard);
+	int found;
+
+	spin_lock(&vfsmount_lock);
+
+	/* extract submounts of 'mountpoint' from the expiration list */
+	while ((found = select_submounts(mountpoint, &graveyard)) != 0)
+		expire_mount_list(&graveyard, mounts);
 
 	spin_unlock(&vfsmount_lock);
 }
 
-EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+EXPORT_SYMBOL_GPL(shrink_submounts);
 
 /*
  * Some copy_from_user() implementations do not return the exact number of
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index a1f3e972c6ef..90d2ea28f333 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -39,7 +39,7 @@
 
 static void ncp_delete_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
-static int  ncp_statfs(struct super_block *, struct kstatfs *);
+static int  ncp_statfs(struct dentry *, struct kstatfs *);
 
 static kmem_cache_t * ncp_inode_cachep;
 
@@ -724,13 +724,14 @@ static void ncp_put_super(struct super_block *sb)
 	kfree(server);
 }
 
-static int ncp_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct dentry* d;
 	struct inode* i;
 	struct ncp_inode_info* ni;
 	struct ncp_server* s;
 	struct ncp_volume_info vi;
+	struct super_block *sb = dentry->d_sb;
 	int err;
 	__u8 dh;
 	
@@ -957,10 +958,10 @@ out:
 	return result;
 }
 
-static struct super_block *ncp_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ncp_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, ncp_fill_super);
+	return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
 }
 
 static struct file_system_type ncp_fs_type = {
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ec61fd56a1a9..0b572a0c1967 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,14 +4,16 @@
 
 obj-$(CONFIG_NFS_FS) += nfs.o
 
-nfs-y 			:= dir.o file.o inode.o nfs2xdr.o pagelist.o \
-			   proc.o read.o symlink.o unlink.o write.o
+nfs-y 			:= dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \
+			   proc.o read.o symlink.o unlink.o write.o \
+			   namespace.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
 nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   delegation.o idmap.o \
-			   callback.o callback_xdr.o callback_proc.o
+			   callback.o callback_xdr.o callback_proc.o \
+			   nfs4namespace.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-objs		:= $(nfs-y)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 90c95adc8c1b..d53f8c6a9ecb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -182,8 +182,6 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 /*
  * Define NFS4 callback program
  */
-extern struct svc_version nfs4_callback_version1;
-
 static struct svc_version *nfs4_callback_version[] = {
 	[1] = &nfs4_callback_version1,
 };
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05c38cf40b69..c92991328d9a 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -202,7 +202,7 @@ static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xd
 	status = decode_fh(xdr, &args->fh);
 out:
 	dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
-	return 0;
+	return status;
 }
 
 static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cae74dd4c7f5..3ddda6f7ecc2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -528,7 +528,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	lock_kernel();
 
-	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (res < 0) {
 		unlock_kernel();
 		return res;
@@ -868,6 +868,17 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 	return (nd->intent.open.flags & O_EXCL) != 0;
 }
 
+static inline int nfs_reval_fsid(struct inode *dir,
+		struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+
+	if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		/* Revalidate fsid on root dir */
+		return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode);
+	return 0;
+}
+
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
 	struct dentry *res;
@@ -900,6 +911,11 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 		res = ERR_PTR(error);
 		goto out_unlock;
 	}
+	error = nfs_reval_fsid(dir, &fhandle, &fattr);
+	if (error < 0) {
+		res = ERR_PTR(error);
+		goto out_unlock;
+	}
 	inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3c72b0c07283..402005c35ab3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -892,7 +892,7 @@ out:
  * nfs_init_directcache - create a slab cache for nfs_direct_req structures
  *
  */
-int nfs_init_directcache(void)
+int __init nfs_init_directcache(void)
 {
 	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
 						sizeof(struct nfs_direct_req),
@@ -906,10 +906,10 @@ int nfs_init_directcache(void)
 }
 
 /**
- * nfs_init_directcache - destroy the slab cache for nfs_direct_req structures
+ * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
  *
  */
-void nfs_destroy_directcache(void)
+void __exit nfs_destroy_directcache(void)
 {
 	if (kmem_cache_destroy(nfs_direct_cachep))
 		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fade02c15e6e..add289138836 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -43,7 +43,7 @@ static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
 static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
 static ssize_t nfs_file_read(struct kiocb *, char __user *, size_t, loff_t);
 static ssize_t nfs_file_write(struct kiocb *, const char __user *, size_t, loff_t);
-static int  nfs_file_flush(struct file *);
+static int  nfs_file_flush(struct file *, fl_owner_t id);
 static int  nfs_fsync(struct file *, struct dentry *dentry, int datasync);
 static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
@@ -127,23 +127,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 }
 
 /**
- * nfs_revalidate_file - Revalidate the page cache & related metadata
- * @inode - pointer to inode struct
- * @file - pointer to file
- */
-static int nfs_revalidate_file(struct inode *inode, struct file *filp)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	int retval = 0;
-
-	if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))
-			|| nfs_attribute_timeout(inode))
-		retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	nfs_revalidate_mapping(inode, filp->f_mapping);
-	return 0;
-}
-
-/**
  * nfs_revalidate_size - Revalidate the file size
  * @inode - pointer to inode struct
  * @file - pointer to struct file
@@ -188,7 +171,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
  *
  */
 static int
-nfs_file_flush(struct file *file)
+nfs_file_flush(struct file *file, fl_owner_t id)
 {
 	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
 	struct inode	*inode = file->f_dentry->d_inode;
@@ -228,7 +211,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long) pos);
 
-	result = nfs_revalidate_file(inode, iocb->ki_filp);
+	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 	nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
 	if (!result)
 		result = generic_file_aio_read(iocb, buf, count, pos);
@@ -247,7 +230,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long long) *ppos);
 
-	res = nfs_revalidate_file(inode, filp);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (!res)
 		res = generic_file_sendfile(filp, ppos, count, actor, target);
 	return res;
@@ -263,7 +246,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dfprintk(VFS, "nfs: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_file(inode, file);
+	status = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (!status)
 		status = generic_file_mmap(file, vma);
 	return status;
@@ -320,7 +303,11 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse
 
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
-	/* FIXME: we really should cancel any unstarted writes on this page */
+	struct inode *inode = page->mapping->host;
+
+	/* Cancel any unstarted writes on this page */
+	if (offset == 0)
+		nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE);
 }
 
 static int nfs_release_page(struct page *page, gfp_t gfp)
@@ -373,7 +360,6 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 		if (result)
 			goto out;
 	}
-	nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 
 	result = count;
 	if (!count)
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 3fab5b0cfc5a..b81e7ed3c902 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -47,7 +47,6 @@
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
-#include <linux/nfs_fs_sb.h>
 #include <linux/nfs_fs.h>
 
 #include <linux/nfs_idmap.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d0b991a92327..51bc88b662fe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,8 @@
 #include <linux/mount.h>
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -44,89 +46,17 @@
 #include "callback.h"
 #include "delegation.h"
 #include "iostat.h"
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 #define NFS_PARANOIA 1
 
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- *        their needs. People that do NFS over a slow network, might for
- *        instance want to reduce it to something closer to 1 for improved
- *        interactive response.
- */
-#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
-
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 
-static struct inode *nfs_alloc_inode(struct super_block *sb);
-static void nfs_destroy_inode(struct inode *);
-static int nfs_write_inode(struct inode *,int);
-static void nfs_delete_inode(struct inode *);
-static void nfs_clear_inode(struct inode *);
-static void nfs_umount_begin(struct super_block *);
-static int  nfs_statfs(struct super_block *, struct kstatfs *);
-static int  nfs_show_options(struct seq_file *, struct vfsmount *);
-static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static void nfs_zap_acl_cache(struct inode *);
 
-static struct rpc_program	nfs_program;
-
-static struct super_operations nfs_sops = { 
-	.alloc_inode	= nfs_alloc_inode,
-	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
-	.delete_inode	= nfs_delete_inode,
-	.statfs		= nfs_statfs,
-	.clear_inode	= nfs_clear_inode,
-	.umount_begin	= nfs_umount_begin,
-	.show_options	= nfs_show_options,
-	.show_stats	= nfs_show_stats,
-};
-
-/*
- * RPC cruft for NFS
- */
-static struct rpc_stat		nfs_rpcstat = {
-	.program		= &nfs_program
-};
-static struct rpc_version *	nfs_version[] = {
-	NULL,
-	NULL,
-	&nfs_version2,
-#if defined(CONFIG_NFS_V3)
-	&nfs_version3,
-#elif defined(CONFIG_NFS_V4)
-	NULL,
-#endif
-#if defined(CONFIG_NFS_V4)
-	&nfs_version4,
-#endif
-};
-
-static struct rpc_program	nfs_program = {
-	.name			= "nfs",
-	.number			= NFS_PROGRAM,
-	.nrvers			= ARRAY_SIZE(nfs_version),
-	.version		= nfs_version,
-	.stats			= &nfs_rpcstat,
-	.pipe_dir_name		= "/nfs",
-};
-
-#ifdef CONFIG_NFS_V3_ACL
-static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
-static struct rpc_version *	nfsacl_version[] = {
-	[3]			= &nfsacl_version3,
-};
-
-struct rpc_program		nfsacl_program = {
-	.name =			"nfsacl",
-	.number =		NFS_ACL_PROGRAM,
-	.nrvers =		ARRAY_SIZE(nfsacl_version),
-	.version =		nfsacl_version,
-	.stats =		&nfsacl_rpcstat,
-};
-#endif  /* CONFIG_NFS_V3_ACL */
+static kmem_cache_t * nfs_inode_cachep;
 
 static inline unsigned long
 nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
@@ -134,8 +64,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
-static int
-nfs_write_inode(struct inode *inode, int sync)
+int nfs_write_inode(struct inode *inode, int sync)
 {
 	int flags = sync ? FLUSH_SYNC : 0;
 	int ret;
@@ -146,31 +75,15 @@ nfs_write_inode(struct inode *inode, int sync)
 	return 0;
 }
 
-static void
-nfs_delete_inode(struct inode * inode)
+void nfs_clear_inode(struct inode *inode)
 {
-	dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
-
-	truncate_inode_pages(&inode->i_data, 0);
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct rpc_cred *cred;
 
-	nfs_wb_all(inode);
 	/*
 	 * The following should never happen...
 	 */
-	if (nfs_have_writebacks(inode)) {
-		printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
-	}
-
-	clear_inode(inode);
-}
-
-static void
-nfs_clear_inode(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct rpc_cred *cred;
-
-	nfs_wb_all(inode);
+	BUG_ON(nfs_have_writebacks(inode));
 	BUG_ON (!list_empty(&nfsi->open_files));
 	nfs_zap_acl_cache(inode);
 	cred = nfsi->cache_access.cred;
@@ -179,554 +92,6 @@ nfs_clear_inode(struct inode *inode)
 	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
 }
 
-void
-nfs_umount_begin(struct super_block *sb)
-{
-	struct rpc_clnt	*rpc = NFS_SB(sb)->client;
-
-	/* -EIO all pending I/O */
-	if (!IS_ERR(rpc))
-		rpc_killall_tasks(rpc);
-	rpc = NFS_SB(sb)->client_acl;
-	if (!IS_ERR(rpc))
-		rpc_killall_tasks(rpc);
-}
-
-
-static inline unsigned long
-nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
-{
-	/* make sure blocksize is a power of two */
-	if ((bsize & (bsize - 1)) || nrbitsp) {
-		unsigned char	nrbits;
-
-		for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
-			;
-		bsize = 1 << nrbits;
-		if (nrbitsp)
-			*nrbitsp = nrbits;
-	}
-
-	return bsize;
-}
-
-/*
- * Calculate the number of 512byte blocks used.
- */
-static inline unsigned long
-nfs_calc_block_size(u64 tsize)
-{
-	loff_t used = (tsize + 511) >> 9;
-	return (used > ULONG_MAX) ? ULONG_MAX : used;
-}
-
-/*
- * Compute and set NFS server blocksize
- */
-static inline unsigned long
-nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
-{
-	if (bsize < NFS_MIN_FILE_IO_SIZE)
-		bsize = NFS_DEF_FILE_IO_SIZE;
-	else if (bsize >= NFS_MAX_FILE_IO_SIZE)
-		bsize = NFS_MAX_FILE_IO_SIZE;
-
-	return nfs_block_bits(bsize, nrbitsp);
-}
-
-/*
- * Obtain the root inode of the file system.
- */
-static struct inode *
-nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
-{
-	struct nfs_server	*server = NFS_SB(sb);
-	int			error;
-
-	error = server->rpc_ops->getroot(server, rootfh, fsinfo);
-	if (error < 0) {
-		dprintk("nfs_get_root: getattr error = %d\n", -error);
-		return ERR_PTR(error);
-	}
-
-	return nfs_fhget(sb, rootfh, fsinfo->fattr);
-}
-
-/*
- * Do NFS version-independent mount processing, and sanity checking
- */
-static int
-nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
-{
-	struct nfs_server	*server;
-	struct inode		*root_inode;
-	struct nfs_fattr	fattr;
-	struct nfs_fsinfo	fsinfo = {
-					.fattr = &fattr,
-				};
-	struct nfs_pathconf pathinfo = {
-			.fattr = &fattr,
-	};
-	int no_root_error = 0;
-	unsigned long max_rpc_payload;
-
-	/* We probably want something more informative here */
-	snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
-
-	server = NFS_SB(sb);
-
-	sb->s_magic      = NFS_SUPER_MAGIC;
-
-	server->io_stats = nfs_alloc_iostats();
-	if (server->io_stats == NULL)
-		return -ENOMEM;
-
-	root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
-	/* Did getting the root inode fail? */
-	if (IS_ERR(root_inode)) {
-		no_root_error = PTR_ERR(root_inode);
-		goto out_no_root;
-	}
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root) {
-		no_root_error = -ENOMEM;
-		goto out_no_root;
-	}
-	sb->s_root->d_op = server->rpc_ops->dentry_ops;
-
-	/* mount time stamp, in seconds */
-	server->mount_time = jiffies;
-
-	/* Get some general file system info */
-	if (server->namelen == 0 &&
-	    server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
-		server->namelen = pathinfo.max_namelen;
-	/* Work out a lot of parameters */
-	if (server->rsize == 0)
-		server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
-	if (server->wsize == 0)
-		server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
-
-	if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
-		server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
-	if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
-		server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
-
-	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
-	if (server->rsize > max_rpc_payload)
-		server->rsize = max_rpc_payload;
-	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
-		server->rsize = NFS_MAX_FILE_IO_SIZE;
-	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	if (server->wsize > max_rpc_payload)
-		server->wsize = max_rpc_payload;
-	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
-		server->wsize = NFS_MAX_FILE_IO_SIZE;
-	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	if (sb->s_blocksize == 0)
-		sb->s_blocksize = nfs_block_bits(server->wsize,
-							 &sb->s_blocksize_bits);
-	server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
-
-	server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
-	if (server->dtsize > PAGE_CACHE_SIZE)
-		server->dtsize = PAGE_CACHE_SIZE;
-	if (server->dtsize > server->rsize)
-		server->dtsize = server->rsize;
-
-	if (server->flags & NFS_MOUNT_NOAC) {
-		server->acregmin = server->acregmax = 0;
-		server->acdirmin = server->acdirmax = 0;
-		sb->s_flags |= MS_SYNCHRONOUS;
-	}
-	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
-	sb->s_maxbytes = fsinfo.maxfilesize;
-	if (sb->s_maxbytes > MAX_LFS_FILESIZE) 
-		sb->s_maxbytes = MAX_LFS_FILESIZE; 
-
-	server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
-	server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
-
-	/* We're airborne Set socket buffersize */
-	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
-	return 0;
-	/* Yargs. It didn't work out. */
-out_no_root:
-	dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
-	if (!IS_ERR(root_inode))
-		iput(root_inode);
-	return no_root_error;
-}
-
-static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
-{
-	to->to_initval = timeo * HZ / 10;
-	to->to_retries = retrans;
-	if (!to->to_retries)
-		to->to_retries = 2;
-
-	switch (proto) {
-	case IPPROTO_TCP:
-		if (!to->to_initval)
-			to->to_initval = 60 * HZ;
-		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
-			to->to_initval = NFS_MAX_TCP_TIMEOUT;
-		to->to_increment = to->to_initval;
-		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
-		to->to_exponential = 0;
-		break;
-	case IPPROTO_UDP:
-	default:
-		if (!to->to_initval)
-			to->to_initval = 11 * HZ / 10;
-		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
-			to->to_initval = NFS_MAX_UDP_TIMEOUT;
-		to->to_maxval = NFS_MAX_UDP_TIMEOUT;
-		to->to_exponential = 1;
-		break;
-	}
-}
-
-/*
- * Create an RPC client handle.
- */
-static struct rpc_clnt *
-nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
-{
-	struct rpc_timeout	timeparms;
-	struct rpc_xprt		*xprt = NULL;
-	struct rpc_clnt		*clnt = NULL;
-	int			proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
-
-	nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
-
-	server->retrans_timeo = timeparms.to_initval;
-	server->retrans_count = timeparms.to_retries;
-
-	/* create transport and client */
-	xprt = xprt_create_proto(proto, &server->addr, &timeparms);
-	if (IS_ERR(xprt)) {
-		dprintk("%s: cannot create RPC transport. Error = %ld\n",
-				__FUNCTION__, PTR_ERR(xprt));
-		return (struct rpc_clnt *)xprt;
-	}
-	clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				 server->rpc_ops->version, data->pseudoflavor);
-	if (IS_ERR(clnt)) {
-		dprintk("%s: cannot create RPC client. Error = %ld\n",
-				__FUNCTION__, PTR_ERR(xprt));
-		goto out_fail;
-	}
-
-	clnt->cl_intr     = 1;
-	clnt->cl_softrtry = 1;
-
-	return clnt;
-
-out_fail:
-	return clnt;
-}
-
-/*
- * The way this works is that the mount process passes a structure
- * in the data argument which contains the server's IP address
- * and the root file handle obtained from the server's mount
- * daemon. We stash these away in the private superblock fields.
- */
-static int
-nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
-{
-	struct nfs_server	*server;
-	rpc_authflavor_t	authflavor;
-
-	server           = NFS_SB(sb);
-	sb->s_blocksize_bits = 0;
-	sb->s_blocksize = 0;
-	if (data->bsize)
-		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
-	if (data->rsize)
-		server->rsize = nfs_block_size(data->rsize, NULL);
-	if (data->wsize)
-		server->wsize = nfs_block_size(data->wsize, NULL);
-	server->flags    = data->flags & NFS_MOUNT_FLAGMASK;
-
-	server->acregmin = data->acregmin*HZ;
-	server->acregmax = data->acregmax*HZ;
-	server->acdirmin = data->acdirmin*HZ;
-	server->acdirmax = data->acdirmax*HZ;
-
-	/* Start lockd here, before we might error out */
-	if (!(server->flags & NFS_MOUNT_NONLM))
-		lockd_up();
-
-	server->namelen  = data->namlen;
-	server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
-	if (!server->hostname)
-		return -ENOMEM;
-	strcpy(server->hostname, data->hostname);
-
-	/* Check NFS protocol revision and initialize RPC op vector
-	 * and file handle pool. */
-#ifdef CONFIG_NFS_V3
-	if (server->flags & NFS_MOUNT_VER3) {
-		server->rpc_ops = &nfs_v3_clientops;
-		server->caps |= NFS_CAP_READDIRPLUS;
-	} else {
-		server->rpc_ops = &nfs_v2_clientops;
-	}
-#else
-	server->rpc_ops = &nfs_v2_clientops;
-#endif
-
-	/* Fill in pseudoflavor for mount version < 5 */
-	if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
-		data->pseudoflavor = RPC_AUTH_UNIX;
-	authflavor = data->pseudoflavor;	/* save for sb_init() */
-	/* XXX maybe we want to add a server->pseudoflavor field */
-
-	/* Create RPC client handles */
-	server->client = nfs_create_client(server, data);
-	if (IS_ERR(server->client))
-		return PTR_ERR(server->client);
-	/* RFC 2623, sec 2.3.2 */
-	if (authflavor != RPC_AUTH_UNIX) {
-		struct rpc_auth *auth;
-
-		server->client_sys = rpc_clone_client(server->client);
-		if (IS_ERR(server->client_sys))
-			return PTR_ERR(server->client_sys);
-		auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
-		if (IS_ERR(auth))
-			return PTR_ERR(auth);
-	} else {
-		atomic_inc(&server->client->cl_count);
-		server->client_sys = server->client;
-	}
-	if (server->flags & NFS_MOUNT_VER3) {
-#ifdef CONFIG_NFS_V3_ACL
-		if (!(server->flags & NFS_MOUNT_NOACL)) {
-			server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
-			/* No errors! Assume that Sun nfsacls are supported */
-			if (!IS_ERR(server->client_acl))
-				server->caps |= NFS_CAP_ACLS;
-		}
-#else
-		server->flags &= ~NFS_MOUNT_NOACL;
-#endif /* CONFIG_NFS_V3_ACL */
-		/*
-		 * The VFS shouldn't apply the umask to mode bits. We will
-		 * do so ourselves when necessary.
-		 */
-		sb->s_flags |= MS_POSIXACL;
-		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
-			server->namelen = NFS3_MAXNAMLEN;
-		sb->s_time_gran = 1;
-	} else {
-		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
-			server->namelen = NFS2_MAXNAMLEN;
-	}
-
-	sb->s_op = &nfs_sops;
-	return nfs_sb_init(sb, authflavor);
-}
-
-static int
-nfs_statfs(struct super_block *sb, struct kstatfs *buf)
-{
-	struct nfs_server *server = NFS_SB(sb);
-	unsigned char blockbits;
-	unsigned long blockres;
-	struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
-	struct nfs_fattr fattr;
-	struct nfs_fsstat res = {
-			.fattr = &fattr,
-	};
-	int error;
-
-	lock_kernel();
-
-	error = server->rpc_ops->statfs(server, rootfh, &res);
-	buf->f_type = NFS_SUPER_MAGIC;
-	if (error < 0)
-		goto out_err;
-
-	/*
-	 * Current versions of glibc do not correctly handle the
-	 * case where f_frsize != f_bsize.  Eventually we want to
-	 * report the value of wtmult in this field.
-	 */
-	buf->f_frsize = sb->s_blocksize;
-
-	/*
-	 * On most *nix systems, f_blocks, f_bfree, and f_bavail
-	 * are reported in units of f_frsize.  Linux hasn't had
-	 * an f_frsize field in its statfs struct until recently,
-	 * thus historically Linux's sys_statfs reports these
-	 * fields in units of f_bsize.
-	 */
-	buf->f_bsize = sb->s_blocksize;
-	blockbits = sb->s_blocksize_bits;
-	blockres = (1 << blockbits) - 1;
-	buf->f_blocks = (res.tbytes + blockres) >> blockbits;
-	buf->f_bfree = (res.fbytes + blockres) >> blockbits;
-	buf->f_bavail = (res.abytes + blockres) >> blockbits;
-
-	buf->f_files = res.tfiles;
-	buf->f_ffree = res.afiles;
-
-	buf->f_namelen = server->namelen;
- out:
-	unlock_kernel();
-	return 0;
-
- out_err:
-	dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
-	buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
-	goto out;
-
-}
-
-static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
-{
-	static struct proc_nfs_info {
-		int flag;
-		char *str;
-		char *nostr;
-	} nfs_info[] = {
-		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
-		{ NFS_MOUNT_INTR, ",intr", "" },
-		{ NFS_MOUNT_NOCTO, ",nocto", "" },
-		{ NFS_MOUNT_NOAC, ",noac", "" },
-		{ NFS_MOUNT_NONLM, ",nolock", "" },
-		{ NFS_MOUNT_NOACL, ",noacl", "" },
-		{ 0, NULL, NULL }
-	};
-	struct proc_nfs_info *nfs_infop;
-	char buf[12];
-	char *proto;
-
-	seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
-	seq_printf(m, ",rsize=%d", nfss->rsize);
-	seq_printf(m, ",wsize=%d", nfss->wsize);
-	if (nfss->acregmin != 3*HZ || showdefaults)
-		seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
-	if (nfss->acregmax != 60*HZ || showdefaults)
-		seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
-	if (nfss->acdirmin != 30*HZ || showdefaults)
-		seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
-	if (nfss->acdirmax != 60*HZ || showdefaults)
-		seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
-	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
-		if (nfss->flags & nfs_infop->flag)
-			seq_puts(m, nfs_infop->str);
-		else
-			seq_puts(m, nfs_infop->nostr);
-	}
-	switch (nfss->client->cl_xprt->prot) {
-		case IPPROTO_TCP:
-			proto = "tcp";
-			break;
-		case IPPROTO_UDP:
-			proto = "udp";
-			break;
-		default:
-			snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
-			proto = buf;
-	}
-	seq_printf(m, ",proto=%s", proto);
-	seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
-	seq_printf(m, ",retrans=%u", nfss->retrans_count);
-}
-
-static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
-
-	nfs_show_mount_options(m, nfss, 0);
-
-	seq_puts(m, ",addr=");
-	seq_escape(m, nfss->hostname, " \t\n\\");
-
-	return 0;
-}
-
-static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
-{
-	int i, cpu;
-	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
-	struct rpc_auth *auth = nfss->client->cl_auth;
-	struct nfs_iostats totals = { };
-
-	seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
-
-	/*
-	 * Display all mount option settings
-	 */
-	seq_printf(m, "\n\topts:\t");
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
-	nfs_show_mount_options(m, nfss, 1);
-
-	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
-
-	seq_printf(m, "\n\tcaps:\t");
-	seq_printf(m, "caps=0x%x", nfss->caps);
-	seq_printf(m, ",wtmult=%d", nfss->wtmult);
-	seq_printf(m, ",dtsize=%d", nfss->dtsize);
-	seq_printf(m, ",bsize=%d", nfss->bsize);
-	seq_printf(m, ",namelen=%d", nfss->namelen);
-
-#ifdef CONFIG_NFS_V4
-	if (nfss->rpc_ops->version == 4) {
-		seq_printf(m, "\n\tnfsv4:\t");
-		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
-		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
-		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
-	}
-#endif
-
-	/*
-	 * Display security flavor in effect for this mount
-	 */
-	seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
-	if (auth->au_flavor)
-		seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
-
-	/*
-	 * Display superblock I/O counters
-	 */
-	for_each_possible_cpu(cpu) {
-		struct nfs_iostats *stats;
-
-		preempt_disable();
-		stats = per_cpu_ptr(nfss->io_stats, cpu);
-
-		for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
-			totals.events[i] += stats->events[i];
-		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
-			totals.bytes[i] += stats->bytes[i];
-
-		preempt_enable();
-	}
-
-	seq_printf(m, "\n\tevents:\t");
-	for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
-		seq_printf(m, "%lu ", totals.events[i]);
-	seq_printf(m, "\n\tbytes:\t");
-	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
-		seq_printf(m, "%Lu ", totals.bytes[i]);
-	seq_printf(m, "\n");
-
-	rpc_print_iostats(m, nfss->client);
-
-	return 0;
-}
-
 /**
  * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
  */
@@ -889,6 +254,14 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+			/* Deal with crossing mountpoints */
+			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+					inode->i_op = &nfs_referral_inode_operations;
+				else
+					inode->i_op = &nfs_mountpoint_inode_operations;
+				inode->i_fop = NULL;
+			}
 		} else if (S_ISLNK(inode->i_mode))
 			inode->i_op = &nfs_symlink_inode_operations;
 		else
@@ -1207,6 +580,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
 		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
 
+	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	lock_kernel();
 	if (!inode || is_bad_inode(inode))
  		goto out_nowait;
@@ -1220,7 +594,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		status = -ESTALE;
 		/* Do we trust the cached ESTALE? */
 		if (NFS_ATTRTIMEO(inode) != 0) {
-			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) {
+			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
 				/* no */
 			} else
 				goto out;
@@ -1251,8 +625,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	}
 	spin_unlock(&inode->i_lock);
 
-	nfs_revalidate_mapping(inode, inode->i_mapping);
-
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 
@@ -1286,8 +658,7 @@ int nfs_attribute_timeout(struct inode *inode)
  */
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
-	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
-	if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+	if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
 			&& !nfs_attribute_timeout(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
@@ -1298,9 +669,16 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
  * @inode - pointer to host inode
  * @mapping - pointer to mapping
  */
-void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret = 0;
+
+	if (NFS_STALE(inode))
+		ret = -ESTALE;
+	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+			|| nfs_attribute_timeout(inode))
+		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 
 	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
 		nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
@@ -1321,6 +699,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 				inode->i_sb->s_id,
 				(long long)NFS_FILEID(inode));
 	}
+	return ret;
 }
 
 /**
@@ -1360,12 +739,6 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
-			&& nfsi->change_attr == fattr->pre_change_attr) {
-		nfsi->change_attr = fattr->change_attr;
-		nfsi->cache_change_attribute = jiffies;
-	}
-
 	/* If we have atomic WCC data, we may update some attributes */
 	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
 		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
@@ -1399,9 +772,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	int data_unstable;
 
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
-		return 0;
-
 	/* Has the inode gone and changed behind our back? */
 	if (nfsi->fileid != fattr->fileid
 			|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
@@ -1414,20 +784,13 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0) {
-		if (nfsi->change_attr == fattr->change_attr)
-			goto out;
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
-		if (!data_unstable)
-			nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
-	}
+	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+			nfsi->change_attr != fattr->change_attr)
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Verify a few of the more important attributes */
-	if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
-		if (!data_unstable)
-			nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
-	}
+	if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	cur_size = i_size_read(inode);
  	new_isize = nfs_size_to_loff_t(fattr->size);
@@ -1444,7 +807,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	if (inode->i_nlink != fattr->nlink)
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 
-out:
 	if (!timespec_equal(&inode->i_atime, &fattr->atime))
 		nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
 
@@ -1470,7 +832,6 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
 		return 0;
 	spin_lock(&inode->i_lock);
-	nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
 	if (time_after(fattr->time_start, nfsi->last_updated))
 		status = nfs_update_inode(inode, fattr);
 	else
@@ -1495,7 +856,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	spin_lock(&inode->i_lock);
 	if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+		nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 		goto out;
 	}
 	status = nfs_update_inode(inode, fattr);
@@ -1518,6 +879,7 @@ out:
  */
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
+	struct nfs_server *server;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_isize, new_isize;
 	unsigned int	invalid = 0;
@@ -1527,9 +889,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			__FUNCTION__, inode->i_sb->s_id, inode->i_ino,
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
-		return 0;
-
 	if (nfsi->fileid != fattr->fileid)
 		goto out_fileid;
 
@@ -1539,6 +898,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		goto out_changed;
 
+	server = NFS_SERVER(inode);
+	/* Update the fsid if and only if this is the root directory */
+	if (inode == inode->i_sb->s_root->d_inode
+			&& !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		server->fsid = fattr->fsid;
+
 	/*
 	 * Update the read time so we don't revalidate too often.
 	 */
@@ -1548,7 +913,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	/* Are we racing with known updates of the metadata on the server? */
 	data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
 	if (data_stable)
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
+		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
@@ -1612,15 +977,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  		inode->i_blksize = fattr->du.nfs2.blocksize;
  	}
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4)) {
-		if (nfsi->change_attr != fattr->change_attr) {
-			dprintk("NFS: change_attr change on server for file %s/%ld\n",
-					inode->i_sb->s_id, inode->i_ino);
-			nfsi->change_attr = fattr->change_attr;
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-			nfsi->cache_change_attribute = jiffies;
-		} else
-			invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA);
+	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+			nfsi->change_attr != fattr->change_attr) {
+		dprintk("NFS: change_attr change on server for file %s/%ld\n",
+				inode->i_sb->s_id, inode->i_ino);
+		nfsi->change_attr = fattr->change_attr;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		nfsi->cache_change_attribute = jiffies;
 	}
 
 	/* Update attrtimeo value if we're out of the unstable period */
@@ -1668,190 +1031,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	goto out_err;
 }
 
-/*
- * File system information
- */
-
-static int nfs_set_super(struct super_block *s, void *data)
-{
-	s->s_fs_info = data;
-	return set_anon_super(s, data);
-}
- 
-static int nfs_compare_super(struct super_block *sb, void *data)
-{
-	struct nfs_server *server = data;
-	struct nfs_server *old = NFS_SB(sb);
-
-	if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
-		return 0;
-	if (old->addr.sin_port != server->addr.sin_port)
-		return 0;
-	return !nfs_compare_fh(&old->fh, &server->fh);
-}
-
-static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
-{
-	int error;
-	struct nfs_server *server = NULL;
-	struct super_block *s;
-	struct nfs_fh *root;
-	struct nfs_mount_data *data = raw_data;
-
-	s = ERR_PTR(-EINVAL);
-	if (data == NULL) {
-		dprintk("%s: missing data argument\n", __FUNCTION__);
-		goto out_err;
-	}
-	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
-		dprintk("%s: bad mount version\n", __FUNCTION__);
-		goto out_err;
-	}
-	switch (data->version) {
-		case 1:
-			data->namlen = 0;
-		case 2:
-			data->bsize  = 0;
-		case 3:
-			if (data->flags & NFS_MOUNT_VER3) {
-				dprintk("%s: mount structure version %d does not support NFSv3\n",
-						__FUNCTION__,
-						data->version);
-				goto out_err;
-			}
-			data->root.size = NFS2_FHSIZE;
-			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
-		case 4:
-			if (data->flags & NFS_MOUNT_SECFLAVOUR) {
-				dprintk("%s: mount structure version %d does not support strong security\n",
-						__FUNCTION__,
-						data->version);
-				goto out_err;
-			}
-		case 5:
-			memset(data->context, 0, sizeof(data->context));
-	}
-#ifndef CONFIG_NFS_V3
-	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
-	s = ERR_PTR(-EPROTONOSUPPORT);
-	if (data->flags & NFS_MOUNT_VER3) {
-		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
-		goto out_err;
-	}
-#endif /* CONFIG_NFS_V3 */
-
-	s = ERR_PTR(-ENOMEM);
-	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-	if (!server)
-		goto out_err;
-	/* Zero out the NFS state stuff */
-	init_nfsv4_state(server);
-	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
-
-	root = &server->fh;
-	if (data->flags & NFS_MOUNT_VER3)
-		root->size = data->root.size;
-	else
-		root->size = NFS2_FHSIZE;
-	s = ERR_PTR(-EINVAL);
-	if (root->size > sizeof(root->data)) {
-		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
-		goto out_err;
-	}
-	memcpy(root->data, data->root.data, root->size);
-
-	/* We now require that the mount process passes the remote address */
-	memcpy(&server->addr, &data->addr, sizeof(server->addr));
-	if (server->addr.sin_addr.s_addr == INADDR_ANY) {
-		dprintk("%s: mount program didn't pass remote address!\n",
-				__FUNCTION__);
-		goto out_err;
-	}
-
-	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
-		goto out_err;
-	}
-
-	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
-	if (IS_ERR(s) || s->s_root)
-		goto out_rpciod_down;
-
-	s->s_flags = flags;
-
-	error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-	if (error) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
-		return ERR_PTR(error);
-	}
-	s->s_flags |= MS_ACTIVE;
-	return s;
-out_rpciod_down:
-	rpciod_down();
-out_err:
-	kfree(server);
-	return s;
-}
-
-static void nfs_kill_super(struct super_block *s)
-{
-	struct nfs_server *server = NFS_SB(s);
-
-	kill_anon_super(s);
-
-	if (!IS_ERR(server->client))
-		rpc_shutdown_client(server->client);
-	if (!IS_ERR(server->client_sys))
-		rpc_shutdown_client(server->client_sys);
-	if (!IS_ERR(server->client_acl))
-		rpc_shutdown_client(server->client_acl);
-
-	if (!(server->flags & NFS_MOUNT_NONLM))
-		lockd_down();	/* release rpc.lockd */
-
-	rpciod_down();		/* release rpciod */
-
-	nfs_free_iostats(server->io_stats);
-	kfree(server->hostname);
-	kfree(server);
-}
-
-static struct file_system_type nfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "nfs",
-	.get_sb		= nfs_get_sb,
-	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
 
 #ifdef CONFIG_NFS_V4
 
-static void nfs4_clear_inode(struct inode *);
-
-
-static struct super_operations nfs4_sops = { 
-	.alloc_inode	= nfs_alloc_inode,
-	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
-	.delete_inode	= nfs_delete_inode,
-	.statfs		= nfs_statfs,
-	.clear_inode	= nfs4_clear_inode,
-	.umount_begin	= nfs_umount_begin,
-	.show_options	= nfs_show_options,
-	.show_stats	= nfs_show_stats,
-};
-
 /*
  * Clean out any remaining NFSv4 state that might be left over due
  * to open() calls that passed nfs_atomic_lookup, but failed to call
  * nfs_open().
  */
-static void nfs4_clear_inode(struct inode *inode)
+void nfs4_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
@@ -1875,357 +1063,9 @@ static void nfs4_clear_inode(struct inode *inode)
 		nfs4_close_state(state, state->state);
 	}
 }
-
-
-static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
-{
-	struct nfs_server *server;
-	struct nfs4_client *clp = NULL;
-	struct rpc_xprt *xprt = NULL;
-	struct rpc_clnt *clnt = NULL;
-	struct rpc_timeout timeparms;
-	rpc_authflavor_t authflavour;
-	int err = -EIO;
-
-	sb->s_blocksize_bits = 0;
-	sb->s_blocksize = 0;
-	server = NFS_SB(sb);
-	if (data->rsize != 0)
-		server->rsize = nfs_block_size(data->rsize, NULL);
-	if (data->wsize != 0)
-		server->wsize = nfs_block_size(data->wsize, NULL);
-	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
-	server->caps = NFS_CAP_ATOMIC_OPEN;
-
-	server->acregmin = data->acregmin*HZ;
-	server->acregmax = data->acregmax*HZ;
-	server->acdirmin = data->acdirmin*HZ;
-	server->acdirmax = data->acdirmax*HZ;
-
-	server->rpc_ops = &nfs_v4_clientops;
-
-	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
-
-	server->retrans_timeo = timeparms.to_initval;
-	server->retrans_count = timeparms.to_retries;
-
-	clp = nfs4_get_client(&server->addr.sin_addr);
-	if (!clp) {
-		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
-		return -EIO;
-	}
-
-	/* Now create transport and client */
-	authflavour = RPC_AUTH_UNIX;
-	if (data->auth_flavourlen != 0) {
-		if (data->auth_flavourlen != 1) {
-			dprintk("%s: Invalid number of RPC auth flavours %d.\n",
-					__FUNCTION__, data->auth_flavourlen);
-			err = -EINVAL;
-			goto out_fail;
-		}
-		if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
-			err = -EFAULT;
-			goto out_fail;
-		}
-	}
-
-	down_write(&clp->cl_sem);
-	if (IS_ERR(clp->cl_rpcclient)) {
-		xprt = xprt_create_proto(data->proto, &server->addr, &timeparms);
-		if (IS_ERR(xprt)) {
-			up_write(&clp->cl_sem);
-			err = PTR_ERR(xprt);
-			dprintk("%s: cannot create RPC transport. Error = %d\n",
-					__FUNCTION__, err);
-			goto out_fail;
-		}
-		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				server->rpc_ops->version, authflavour);
-		if (IS_ERR(clnt)) {
-			up_write(&clp->cl_sem);
-			err = PTR_ERR(clnt);
-			dprintk("%s: cannot create RPC client. Error = %d\n",
-					__FUNCTION__, err);
-			goto out_fail;
-		}
-		clnt->cl_intr     = 1;
-		clnt->cl_softrtry = 1;
-		clp->cl_rpcclient = clnt;
-		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
-		nfs_idmap_new(clp);
-	}
-	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
-	clnt = rpc_clone_client(clp->cl_rpcclient);
-	if (!IS_ERR(clnt))
-			server->nfs4_state = clp;
-	up_write(&clp->cl_sem);
-	clp = NULL;
-
-	if (IS_ERR(clnt)) {
-		err = PTR_ERR(clnt);
-		dprintk("%s: cannot create RPC client. Error = %d\n",
-				__FUNCTION__, err);
-		return err;
-	}
-
-	server->client    = clnt;
-
-	if (server->nfs4_state->cl_idmap == NULL) {
-		dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
-		return -ENOMEM;
-	}
-
-	if (clnt->cl_auth->au_flavor != authflavour) {
-		struct rpc_auth *auth;
-
-		auth = rpcauth_create(authflavour, clnt);
-		if (IS_ERR(auth)) {
-			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
-			return PTR_ERR(auth);
-		}
-	}
-
-	sb->s_time_gran = 1;
-
-	sb->s_op = &nfs4_sops;
-	err = nfs_sb_init(sb, authflavour);
-	if (err == 0)
-		return 0;
-out_fail:
-	if (clp)
-		nfs4_put_client(clp);
-	return err;
-}
-
-static int nfs4_compare_super(struct super_block *sb, void *data)
-{
-	struct nfs_server *server = data;
-	struct nfs_server *old = NFS_SB(sb);
-
-	if (strcmp(server->hostname, old->hostname) != 0)
-		return 0;
-	if (strcmp(server->mnt_path, old->mnt_path) != 0)
-		return 0;
-	return 1;
-}
-
-static void *
-nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
-{
-	void *p = NULL;
-
-	if (!src->len)
-		return ERR_PTR(-EINVAL);
-	if (src->len < maxlen)
-		maxlen = src->len;
-	if (dst == NULL) {
-		p = dst = kmalloc(maxlen + 1, GFP_KERNEL);
-		if (p == NULL)
-			return ERR_PTR(-ENOMEM);
-	}
-	if (copy_from_user(dst, src->data, maxlen)) {
-		kfree(p);
-		return ERR_PTR(-EFAULT);
-	}
-	dst[maxlen] = '\0';
-	return dst;
-}
-
-static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
-{
-	int error;
-	struct nfs_server *server;
-	struct super_block *s;
-	struct nfs4_mount_data *data = raw_data;
-	void *p;
-
-	if (data == NULL) {
-		dprintk("%s: missing data argument\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
-	}
-	if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
-		dprintk("%s: bad mount version\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
-	}
-
-	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-	if (!server)
-		return ERR_PTR(-ENOMEM);
-	/* Zero out the NFS state stuff */
-	init_nfsv4_state(server);
-	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
-
-	p = nfs_copy_user_string(NULL, &data->hostname, 256);
-	if (IS_ERR(p))
-		goto out_err;
-	server->hostname = p;
-
-	p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
-	if (IS_ERR(p))
-		goto out_err;
-	server->mnt_path = p;
-
-	p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
-			sizeof(server->ip_addr) - 1);
-	if (IS_ERR(p))
-		goto out_err;
-
-	/* We now require that the mount process passes the remote address */
-	if (data->host_addrlen != sizeof(server->addr)) {
-		s = ERR_PTR(-EINVAL);
-		goto out_free;
-	}
-	if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
-		s = ERR_PTR(-EFAULT);
-		goto out_free;
-	}
-	if (server->addr.sin_family != AF_INET ||
-	    server->addr.sin_addr.s_addr == INADDR_ANY) {
-		dprintk("%s: mount program didn't pass remote IP address!\n",
-				__FUNCTION__);
-		s = ERR_PTR(-EINVAL);
-		goto out_free;
-	}
-
-	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
-		goto out_free;
-	}
-
-	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
-
-	if (IS_ERR(s) || s->s_root)
-		goto out_free;
-
-	s->s_flags = flags;
-
-	error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-	if (error) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
-		return ERR_PTR(error);
-	}
-	s->s_flags |= MS_ACTIVE;
-	return s;
-out_err:
-	s = (struct super_block *)p;
-out_free:
-	kfree(server->mnt_path);
-	kfree(server->hostname);
-	kfree(server);
-	return s;
-}
-
-static void nfs4_kill_super(struct super_block *sb)
-{
-	struct nfs_server *server = NFS_SB(sb);
-
-	nfs_return_all_delegations(sb);
-	kill_anon_super(sb);
-
-	nfs4_renewd_prepare_shutdown(server);
-
-	if (server->client != NULL && !IS_ERR(server->client))
-		rpc_shutdown_client(server->client);
-
-	destroy_nfsv4_state(server);
-
-	rpciod_down();
-
-	nfs_free_iostats(server->io_stats);
-	kfree(server->hostname);
-	kfree(server);
-}
-
-static struct file_system_type nfs4_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "nfs4",
-	.get_sb		= nfs4_get_sb,
-	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
-
-static const int nfs_set_port_min = 0;
-static const int nfs_set_port_max = 65535;
-static int param_set_port(const char *val, struct kernel_param *kp)
-{
-	char *endp;
-	int num = simple_strtol(val, &endp, 0);
-	if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
-		return -EINVAL;
-	*((int *)kp->arg) = num;
-	return 0;
-}
-
-module_param_call(callback_tcpport, param_set_port, param_get_int,
-		 &nfs_callback_set_tcpport, 0644);
-
-static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
-{
-	char *endp;
-	int num = simple_strtol(val, &endp, 0);
-	int jif = num * HZ;
-	if (endp == val || *endp || num < 0 || jif < num)
-		return -EINVAL;
-	*((int *)kp->arg) = jif;
-	return 0;
-}
-
-module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
-		 &nfs_idmap_cache_timeout, 0644);
-
-#define nfs4_init_once(nfsi) \
-	do { \
-		INIT_LIST_HEAD(&(nfsi)->open_states); \
-		nfsi->delegation = NULL; \
-		nfsi->delegation_state = 0; \
-		init_rwsem(&nfsi->rwsem); \
-	} while(0)
-
-static inline int register_nfs4fs(void)
-{
-	int ret;
-
-	ret = nfs_register_sysctl();
-	if (ret != 0)
-		return ret;
-	ret = register_filesystem(&nfs4_fs_type);
-	if (ret != 0)
-		nfs_unregister_sysctl();
-	return ret;
-}
-
-static inline void unregister_nfs4fs(void)
-{
-	unregister_filesystem(&nfs4_fs_type);
-	nfs_unregister_sysctl();
-}
-#else
-#define nfs4_init_once(nfsi) \
-	do { } while (0)
-#define register_nfs4fs() (0)
-#define unregister_nfs4fs()
 #endif
 
-extern int nfs_init_nfspagecache(void);
-extern void nfs_destroy_nfspagecache(void);
-extern int nfs_init_readpagecache(void);
-extern void nfs_destroy_readpagecache(void);
-extern int nfs_init_writepagecache(void);
-extern void nfs_destroy_writepagecache(void);
-#ifdef CONFIG_NFS_DIRECTIO
-extern int nfs_init_directcache(void);
-extern void nfs_destroy_directcache(void);
-#endif
-
-static kmem_cache_t * nfs_inode_cachep;
-
-static struct inode *nfs_alloc_inode(struct super_block *sb)
+struct inode *nfs_alloc_inode(struct super_block *sb)
 {
 	struct nfs_inode *nfsi;
 	nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL);
@@ -2244,11 +1084,21 @@ static struct inode *nfs_alloc_inode(struct super_block *sb)
 	return &nfsi->vfs_inode;
 }
 
-static void nfs_destroy_inode(struct inode *inode)
+void nfs_destroy_inode(struct inode *inode)
 {
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#ifdef CONFIG_NFS_V4
+	INIT_LIST_HEAD(&nfsi->open_states);
+	nfsi->delegation = NULL;
+	nfsi->delegation_state = 0;
+	init_rwsem(&nfsi->rwsem);
+#endif
+}
+
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 {
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
@@ -2269,7 +1119,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 	}
 }
  
-static int nfs_init_inodecache(void)
+static int __init nfs_init_inodecache(void)
 {
 	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
 					     sizeof(struct nfs_inode),
@@ -2282,7 +1132,7 @@ static int nfs_init_inodecache(void)
 	return 0;
 }
 
-static void nfs_destroy_inodecache(void)
+static void __exit nfs_destroy_inodecache(void)
 {
 	if (kmem_cache_destroy(nfs_inode_cachep))
 		printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n");
@@ -2311,29 +1161,22 @@ static int __init init_nfs_fs(void)
 	if (err)
 		goto out1;
 
-#ifdef CONFIG_NFS_DIRECTIO
 	err = nfs_init_directcache();
 	if (err)
 		goto out0;
-#endif
 
 #ifdef CONFIG_PROC_FS
 	rpc_proc_register(&nfs_rpcstat);
 #endif
-        err = register_filesystem(&nfs_fs_type);
-	if (err)
-		goto out;
-	if ((err = register_nfs4fs()) != 0)
+	if ((err = register_nfs_fs()) != 0)
 		goto out;
 	return 0;
 out:
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
-#ifdef CONFIG_NFS_DIRECTIO
 	nfs_destroy_directcache();
 out0:
-#endif
 	nfs_destroy_writepagecache();
 out1:
 	nfs_destroy_readpagecache();
@@ -2347,9 +1190,7 @@ out4:
 
 static void __exit exit_nfs_fs(void)
 {
-#ifdef CONFIG_NFS_DIRECTIO
 	nfs_destroy_directcache();
-#endif
 	nfs_destroy_writepagecache();
 	nfs_destroy_readpagecache();
 	nfs_destroy_inodecache();
@@ -2357,8 +1198,7 @@ static void __exit exit_nfs_fs(void)
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
-	unregister_filesystem(&nfs_fs_type);
-	unregister_nfs4fs();
+	unregister_nfs_fs();
 }
 
 /* Not quite true; I just maintain it */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
new file mode 100644
index 000000000000..bd2815e2dec1
--- /dev/null
+++ b/fs/nfs/internal.h
@@ -0,0 +1,186 @@
+/*
+ * NFS internal definitions
+ */
+
+#include <linux/mount.h>
+
+struct nfs_clone_mount {
+	const struct super_block *sb;
+	const struct dentry *dentry;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
+	char *hostname;
+	char *mnt_path;
+	struct sockaddr_in *addr;
+	rpc_authflavor_t authflavor;
+};
+
+/* namespace-nfs4.c */
+#ifdef CONFIG_NFS_V4
+extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry);
+#else
+static inline
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+	return ERR_PTR(-ENOENT);
+}
+#endif
+
+/* callback_xdr.c */
+extern struct svc_version nfs4_callback_version1;
+
+/* pagelist.c */
+extern int __init nfs_init_nfspagecache(void);
+extern void __exit nfs_destroy_nfspagecache(void);
+extern int __init nfs_init_readpagecache(void);
+extern void __exit nfs_destroy_readpagecache(void);
+extern int __init nfs_init_writepagecache(void);
+extern void __exit nfs_destroy_writepagecache(void);
+
+#ifdef CONFIG_NFS_DIRECTIO
+extern int __init nfs_init_directcache(void);
+extern void __exit nfs_destroy_directcache(void);
+#else
+#define nfs_init_directcache() (0)
+#define nfs_destroy_directcache() do {} while(0)
+#endif
+
+/* nfs2xdr.c */
+extern struct rpc_procinfo nfs_procedures[];
+extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
+
+/* nfs3xdr.c */
+extern struct rpc_procinfo nfs3_procedures[];
+extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
+
+/* nfs4xdr.c */
+extern int nfs_stat_to_errno(int);
+extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
+
+/* nfs4proc.c */
+#ifdef CONFIG_NFS_V4
+extern struct rpc_procinfo nfs4_procedures[];
+
+extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+				  struct nfs4_fs_locations *fs_locations,
+				  struct page *page);
+#endif
+
+/* inode.c */
+extern struct inode *nfs_alloc_inode(struct super_block *sb);
+extern void nfs_destroy_inode(struct inode *);
+extern int nfs_write_inode(struct inode *,int);
+extern void nfs_clear_inode(struct inode *);
+#ifdef CONFIG_NFS_V4
+extern void nfs4_clear_inode(struct inode *);
+#endif
+
+/* super.c */
+extern struct file_system_type nfs_referral_nfs4_fs_type;
+extern struct file_system_type clone_nfs_fs_type;
+#ifdef CONFIG_NFS_V4
+extern struct file_system_type clone_nfs4_fs_type;
+#endif
+#ifdef CONFIG_PROC_FS
+extern struct rpc_stat nfs_rpcstat;
+#endif
+extern int __init register_nfs_fs(void);
+extern void __exit unregister_nfs_fs(void);
+
+/* namespace.c */
+extern char *nfs_path(const char *base, const struct dentry *dentry,
+		      char *buffer, ssize_t buflen);
+
+/*
+ * Determine the mount path as a string
+ */
+static inline char *
+nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+#ifdef CONFIG_NFS_V4
+	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
+#else
+	return NULL;
+#endif
+}
+
+/*
+ * Determine the device name as a string
+ */
+static inline char *nfs_devname(const struct vfsmount *mnt_parent,
+			 const struct dentry *dentry,
+			 char *buffer, ssize_t buflen)
+{
+	return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
+}
+
+/*
+ * Round bsize down to a power of two (and report log2 thereof via nrbitsp)
+ */
+static inline
+unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
+{
+	/* make sure blocksize is a power of two */
+	if ((bsize & (bsize - 1)) || nrbitsp) {
+		unsigned char	nrbits = 8 * sizeof(bsize) - 1;
+
+		while (nrbits && !(bsize & (1UL << nrbits)))
+			nrbits--;
+		bsize = 1UL << nrbits;	/* UL: int 1 << 31 overflows, sign-extends */
+		if (nrbitsp)
+			*nrbitsp = nrbits;
+	}
+
+	return bsize;
+}
+
+/*
+ * Calculate the number of 512byte blocks used.
+ */
+static inline unsigned long nfs_calc_block_size(u64 tsize)
+{
+	loff_t used = (tsize + 511) >> 9;
+	return (used > ULONG_MAX) ? ULONG_MAX : used;
+}
+
+/*
+ * Compute and set NFS server blocksize
+ */
+static inline
+unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
+{
+	if (bsize < NFS_MIN_FILE_IO_SIZE)
+		bsize = NFS_DEF_FILE_IO_SIZE;
+	else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+		bsize = NFS_MAX_FILE_IO_SIZE;
+
+	return nfs_block_bits(bsize, nrbitsp);
+}
+
+/*
+ * Determine the maximum file size for a superblock
+ */
+static inline
+void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+	sb->s_maxbytes = (loff_t)maxfilesize;
+	if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
+/*
+ * Check if the string represents a "valid" IPv4 address (each field 0..255)
+ */
+static inline int valid_ipaddr4(const char *buf)
+{
+	int rc, count, in[4];
+
+	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+	if (rc != 4)
+		return -EINVAL;
+	for (count = 0; count < 4; count++) {
+		if (in[count] < 0 || in[count] > 255)	/* %d accepts a sign */
+			return -EINVAL;
+	}
+	return 0;
+}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 000000000000..19b98ca468eb
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,229 @@
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * NFS namespace
+ */
+
+#include <linux/config.h>
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+static void nfs_expire_automounts(void *list);
+
+LIST_HEAD(nfs_automount_list);
+static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @base - arbitrary string to prepend to the path
+ * @dentry - pointer to dentry
+ * @buffer - result buffer
+ * @buflen - length of buffer
+ *
+ * Constructs the path from the root dentry to an arbitrary hashed dentry,
+ * e.g. to figure out the server-side path when automounting on a partition.
+ * Returns a pointer into @buffer, or ERR_PTR(-ENAMETOOLONG) if it won't fit.
+ */
+char *nfs_path(const char *base, const struct dentry *dentry,
+	       char *buffer, ssize_t buflen)
+{
+	char *end = buffer+buflen;
+	int namelen;
+
+	*--end = '\0';
+	buflen--;
+	spin_lock(&dcache_lock);
+	while (!IS_ROOT(dentry)) {
+		namelen = dentry->d_name.len;
+		buflen -= namelen + 1;
+		if (buflen < 0)
+			goto Elong_unlock;	/* dcache_lock is still held here */
+		end -= namelen;
+		memcpy(end, dentry->d_name.name, namelen);
+		*--end = '/';
+		dentry = dentry->d_parent;
+	}
+	spin_unlock(&dcache_lock);
+	namelen = strlen(base);
+	/* Strip off excess slashes in base string */
+	while (namelen > 0 && base[namelen - 1] == '/')
+		namelen--;
+	buflen -= namelen;
+	if (buflen < 0)
+		goto Elong;
+	end -= namelen;
+	memcpy(end, base, namelen);
+	return end;
+Elong_unlock:
+	spin_unlock(&dcache_lock);
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+/*
+ * nfs_follow_mountpoint - handle crossing a mountpoint on the server
+ * @dentry - dentry of mountpoint
+ * @nd - nameidata info
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ */
+static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vfsmount *mnt;
+	struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+	struct dentry *parent;
+	struct nfs_fh fh;
+	struct nfs_fattr fattr;
+	int err;
+
+	BUG_ON(IS_ROOT(dentry));
+	dprintk("%s: enter\n", __FUNCTION__);
+	dput(nd->dentry);
+	nd->dentry = dget(dentry);
+	if (d_mountpoint(nd->dentry))
+		goto out_follow;
+	/* Look it up again */
+	parent = dget_parent(nd->dentry);
+	err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr);
+	dput(parent);
+	if (err != 0)
+		goto out_err;
+
+	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
+		mnt = nfs_do_refmount(nd->mnt, nd->dentry);
+	else
+		mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
+	err = PTR_ERR(mnt);
+	if (IS_ERR(mnt))
+		goto out_err;
+
+	mntget(mnt);
+	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list);
+	if (err < 0) {
+		mntput(mnt);
+		if (err == -EBUSY)
+			goto out_follow;
+		goto out_err;
+	}
+	mntput(nd->mnt);
+	dput(nd->dentry);
+	nd->mnt = mnt;
+	nd->dentry = dget(mnt->mnt_root);
+	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+out:
+	dprintk("%s: done, returned %d\n", __FUNCTION__, err);
+	return ERR_PTR(err);
+out_err:
+	path_release(nd);
+	goto out;
+out_follow:
+	while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+		;
+	err = 0;
+	goto out;
+}
+
+struct inode_operations nfs_mountpoint_inode_operations = {
+	.follow_link	= nfs_follow_mountpoint,
+	.getattr	= nfs_getattr,
+};
+
+struct inode_operations nfs_referral_inode_operations = {
+	.follow_link	= nfs_follow_mountpoint,
+};
+
+static void nfs_expire_automounts(void *data)
+{
+	struct list_head *list = (struct list_head *)data;
+
+	mark_mounts_for_expiry(list);
+	if (!list_empty(list))
+		schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+}
+
+void nfs_release_automount_timer(void)
+{
+	if (list_empty(&nfs_automount_list)) {
+		cancel_delayed_work(&nfs_automount_task);
+		flush_scheduled_work();
+	}
+}
+
+/*
+ * Clone a mountpoint of the appropriate type for the server's NFS version
+ */
+static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname,
+					   struct nfs_clone_mount *mountdata)
+{
+#ifdef CONFIG_NFS_V4
+	struct vfsmount *mnt = ERR_PTR(-EINVAL);	/* unknown version: error, not NULL */
+	switch (server->rpc_ops->version) {
+		case 2:
+		case 3:
+			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
+			break;
+		case 4:
+			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
+	}
+	return mnt;
+#else
+	return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
+#endif
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - parent directory
+ * @fh - filehandle for new root dentry
+ * @fattr - attributes for new root inode
+ *
+ */
+struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+		const struct dentry *dentry, struct nfs_fh *fh,
+		struct nfs_fattr *fattr)
+{
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.fh = fh,
+		.fattr = fattr,
+	};
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *devname;
+
+	dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name);
+	if (page == NULL)
+		goto out;
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	mnt = (struct vfsmount *)devname;
+	if (IS_ERR(devname))
+		goto free_page;
+	mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata);
+free_page:
+	free_page((unsigned long)page);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f0015fa876e1..67391eef6b93 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -23,12 +23,11 @@
 #include <linux/nfs.h>
 #include <linux/nfs2.h>
 #include <linux/nfs_fs.h>
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 /* #define NFS_PARANOIA 1 */
 
-extern int			nfs_stat_to_errno(int stat);
-
 /* Mapping from NFS error code to "errno" error code. */
 #define errno_NFSERR_IO		EIO
 
@@ -131,7 +130,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	fattr->du.nfs2.blocksize = ntohl(*p++);
 	rdev = ntohl(*p++);
 	fattr->du.nfs2.blocks = ntohl(*p++);
-	fattr->fsid_u.nfs3 = ntohl(*p++);
+	fattr->fsid.major = ntohl(*p++);
+	fattr->fsid.minor = 0;
 	fattr->fileid = ntohl(*p++);
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 33287879bd23..7322da4d2055 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -172,8 +172,10 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
 		inode->i_ino, acl, dfacl);
 	spin_lock(&inode->i_lock);
 	__nfs3_forget_cached_acls(NFS_I(inode));
-	nfsi->acl_access = posix_acl_dup(acl);
-	nfsi->acl_default = posix_acl_dup(dfacl);
+	if (!IS_ERR(acl))
+		nfsi->acl_access = posix_acl_dup(acl);
+	if (!IS_ERR(dfacl))
+		nfsi->acl_default = posix_acl_dup(dfacl);
 	spin_unlock(&inode->i_lock);
 }
 
@@ -254,7 +256,9 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 			res.acl_access = NULL;
 		}
 	}
-	nfs3_cache_acls(inode, res.acl_access, res.acl_default);
+	nfs3_cache_acls(inode,
+		(res.mask & NFS_ACL)   ? res.acl_access  : ERR_PTR(-EINVAL),
+		(res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
@@ -329,6 +333,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	switch (status) {
 		case 0:
 			status = nfs_refresh_inode(inode, &fattr);
+			nfs3_cache_acls(inode, acl, dfacl);
 			break;
 		case -EPFNOSUPPORT:
 		case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cf186f0d2b3b..7143b1f82cea 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -20,11 +20,10 @@
 #include <linux/nfs_mount.h>
 
 #include "iostat.h"
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
-extern struct rpc_procinfo nfs3_procedures[];
-
 /* A wrapper to handle the EJUKEBOX error message */
 static int
 nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
@@ -809,8 +808,6 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return status;
 }
 
-extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
-
 static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ec233619687e..0250269e9753 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -22,14 +22,13 @@
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfsacl.h>
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
 /* Mapping from NFS error code to "errno" error code. */
 #define errno_NFSERR_IO		EIO
 
-extern int			nfs_stat_to_errno(int);
-
 /*
  * Declare the space requirements for NFS arguments and replies as
  * number of 32bit-words
@@ -166,7 +165,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
 		fattr->rdev = 0;
 
-	p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3);
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
 	p = xdr_decode_hyper(p, &fattr->fileid);
 	p = xdr_decode_time3(p, &fattr->atime);
 	p = xdr_decode_time3(p, &fattr->mtime);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0f5e4e7cddec..9a102860df37 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,9 @@ extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
+extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+		struct nfs4_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
@@ -225,6 +228,7 @@ extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
 extern const u32 nfs4_pathconf_bitmap[2];
 extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fs_locations_bitmap[2];
 
 /* nfs4renewd.c */
 extern void nfs4_schedule_state_renewal(struct nfs4_client *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
new file mode 100644
index 000000000000..ea38d27b74e6
--- /dev/null
+++ b/fs/nfs/nfs4namespace.c
@@ -0,0 +1,201 @@
+/*
+ * linux/fs/nfs/nfs4namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * NFSv4 namespace
+ */
+
+#include <linux/config.h>
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+/*
+ * Convert an nfs4_pathname to a '/'-separated string at the end of buffer
+ */
+static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname,
+					 char *buffer, ssize_t buflen)
+{
+	char *end = buffer + buflen;
+	int n;
+
+	*--end = '\0';
+	buflen--;
+
+	n = pathname->ncomponents;
+	while (--n >= 0) {
+		struct nfs4_string *component = &pathname->components[n];
+		buflen -= component->len + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= component->len;
+		memcpy(end, component->data, component->len);
+		*--end = '/';
+	}
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - parent directory
+ * @locations - fs_locations data: fs path plus candidate servers/root paths
+ *
+ * Tries each valid server of each location until one mount succeeds.
+ * Returns the new vfsmount, or an ERR_PTR() if nothing could be mounted.
+ *
+ */
+static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
+					    const struct dentry *dentry,
+					    struct nfs4_fs_locations *locations)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
+	};
+	char *page, *page2;
+	char *path, *fs_path;
+	char *devname;
+	int loc, s;
+
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	dprintk("%s: referral at %s/%s\n", __FUNCTION__,
+		dentry->d_parent->d_name.name, dentry->d_name.name);
+
+	/* Ensure fs path is a prefix of current dentry path */
+	page = (char *) __get_free_page(GFP_USER);
+	if (page == NULL)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (page2 == NULL)
+		goto out_free;	/* don't leak page; free_page(0) is a no-op */
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (IS_ERR(path))
+		goto out_free;
+
+	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
+	if (IS_ERR(fs_path))
+		goto out_free;
+
+	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
+		dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
+		goto out_free;
+	}
+
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	if (IS_ERR(devname)) {
+		mnt = (struct vfsmount *)devname;
+		goto out_free;
+	}
+
+	loc = 0;
+	while (loc < locations->nlocations && IS_ERR(mnt)) {
+		struct nfs4_fs_location *location = &locations->locations[loc];
+		char *mnt_path;
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0) {
+			loc++;
+			continue;
+		}
+
+		mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+		if (IS_ERR(mnt_path)) {
+			loc++;
+			continue;
+		}
+		mountdata.mnt_path = mnt_path;
+
+		s = 0;
+		while (s < location->nservers) {
+			struct sockaddr_in addr = {};
+
+			if (location->servers[s].len <= 0 ||
+			    valid_ipaddr4(location->servers[s].data) < 0) {
+				s++;
+				continue;
+			}
+
+			mountdata.hostname = location->servers[s].data;
+			addr.sin_addr.s_addr = in_aton(mountdata.hostname);
+			addr.sin_family = AF_INET;
+			addr.sin_port = htons(NFS_PORT);
+			mountdata.addr = &addr;
+
+			mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
+			if (!IS_ERR(mnt)) {
+				break;
+			}
+			s++;
+		}
+		loc++;
+	}
+
+out_free:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - dentry of referral
+ *
+ */
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct dentry *parent;
+	struct nfs4_fs_locations *fs_locations = NULL;
+	struct page *page;
+	int err;
+
+	/* BUG_ON(IS_ROOT(dentry)); */
+	dprintk("%s: enter\n", __FUNCTION__);
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+
+	fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (fs_locations == NULL)
+		goto out_free;
+
+	/* Get locations */
+	parent = dget_parent(dentry);
+	dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
+	err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
+	dput(parent);
+	if (err != 0 || fs_locations->nlocations <= 0 ||
+	    fs_locations->fs_path.ncomponents <= 0)
+		goto out_free;
+
+	mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
+out_free:
+	__free_page(page);
+	kfree(fs_locations);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d86c0db7b1e8..b4916b092194 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -65,8 +65,6 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *)
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp);
-extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
-extern struct rpc_procinfo nfs4_procedures[];
 
 /* Prevent leaks of NFSv4 errors into userland */
 int nfs4_map_errors(int err)
@@ -121,6 +119,25 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
 			0
 };
 
+const u32 nfs4_fs_locations_bitmap[2] = {
+	FATTR4_WORD0_TYPE
+	| FATTR4_WORD0_CHANGE
+	| FATTR4_WORD0_SIZE
+	| FATTR4_WORD0_FSID
+	| FATTR4_WORD0_FILEID
+	| FATTR4_WORD0_FS_LOCATIONS,
+	FATTR4_WORD1_MODE
+	| FATTR4_WORD1_NUMLINKS
+	| FATTR4_WORD1_OWNER
+	| FATTR4_WORD1_OWNER_GROUP
+	| FATTR4_WORD1_RAWDEV
+	| FATTR4_WORD1_SPACE_USED
+	| FATTR4_WORD1_TIME_ACCESS
+	| FATTR4_WORD1_TIME_METADATA
+	| FATTR4_WORD1_TIME_MODIFY
+	| FATTR4_WORD1_MOUNTED_ON_FILEID
+};
+
 static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
 		struct nfs4_readdir_arg *readdir)
 {
@@ -185,15 +202,15 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 	spin_unlock(&clp->cl_lock);
 }
 
-static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinfo)
+static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_inode *nfsi = NFS_I(dir);
 
-	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+	spin_lock(&dir->i_lock);
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
 	if (cinfo->before == nfsi->change_attr && cinfo->atomic)
 		nfsi->change_attr = cinfo->after;
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&dir->i_lock);
 }
 
 struct nfs4_opendata {
@@ -1331,7 +1348,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	return status;
 }
 
-static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
 	struct nfs4_exception exception = { };
 	int err;
@@ -1443,6 +1460,50 @@ out:
 	return nfs4_map_errors(status);
 }
 
+/*
+ * Get locations and (maybe) other attributes of a referral.
+ * Note that we'll actually follow the referral later when
+ * we detect fsid mismatch in inode revalidation
+ */
+static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle)
+{
+	int status = -ENOMEM;
+	struct page *page = NULL;
+	struct nfs4_fs_locations *locations = NULL;
+	struct dentry dentry = {};
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (locations == NULL)
+		goto out;
+
+	dentry.d_name.name = name->name;
+	dentry.d_name.len = name->len;
+	status = nfs4_proc_fs_locations(dir, &dentry, locations, page);
+	if (status != 0)
+		goto out;
+	/* Make sure server returned a different fsid for the referral */
+	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+		dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name);
+		status = -EIO;
+		goto out;
+	}
+
+	memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
+	fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL;
+	if (!fattr->mode)
+		fattr->mode = S_IFDIR;
+	memset(fhandle, 0, sizeof(struct nfs_fh));
+out:
+	if (page)
+		__free_page(page);
+	if (locations)
+		kfree(locations);
+	return status;
+}
+
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs4_getattr_arg args = {
@@ -1547,6 +1608,8 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
 	
 	dprintk("NFS call  lookup %s\n", name->name);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	if (status == -NFS4ERR_MOVED)
+		status = nfs4_get_referral(dir, name, fattr, fhandle);
 	dprintk("NFS reply lookup: %d\n", status);
 	return status;
 }
@@ -2008,7 +2071,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
 		nfs_post_op_update_inode(dir, res.dir_attr);
-		nfs_refresh_inode(inode, res.fattr);
+		nfs_post_op_update_inode(inode, res.fattr);
 	}
 
 	return status;
@@ -3570,6 +3633,36 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 	return len;
 }
 
+int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+		struct nfs4_fs_locations *fs_locations, struct page *page)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+		[1] = FATTR4_WORD1_MOUNTED_ON_FILEID,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.dir_fh = NFS_FH(dir),
+		.name = &dentry->d_name,
+		.page = page,
+		.bitmask = bitmask,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp = &args,
+		.rpc_resp = fs_locations,
+	};
+	int status;
+
+	dprintk("%s: start\n", __FUNCTION__);
+	fs_locations->fattr.valid = 0;
+	fs_locations->server = server;
+	fs_locations->nlocations = 0;
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("%s: returned status = %d\n", __FUNCTION__, status);
+	return status;
+}
+
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
 	.recover_open	= nfs4_open_reclaim,
 	.recover_lock	= nfs4_lock_reclaim,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7c5d70efe720..1750d996f49f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -411,6 +411,15 @@ static int nfs_stat_to_errno(int);
 #define NFS4_dec_setacl_sz	(compound_decode_hdr_maxsz + \
 				decode_putfh_maxsz + \
 				op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+#define NFS4_enc_fs_locations_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_getattr_maxsz)
+#define NFS4_dec_fs_locations_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_putfh_maxsz + \
+				 op_decode_hdr_maxsz + \
+				 nfs4_fattr_bitmap_maxsz)
 
 static struct {
 	unsigned int	mode;
@@ -722,6 +731,13 @@ static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask)
 			bitmask[1] & nfs4_fsinfo_bitmap[1]);
 }
 
+static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
+{
+	return encode_getattr_two(xdr,
+				  bitmask[0] & nfs4_fs_locations_bitmap[0],
+				  bitmask[1] & nfs4_fs_locations_bitmap[1]);
+}
+
 static int encode_getfh(struct xdr_stream *xdr)
 {
 	uint32_t *p;
@@ -2003,6 +2019,38 @@ out:
 }
 
 /*
+ * Encode FS_LOCATIONS request
+ */
+static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.nops = 3,
+	};
+	struct rpc_auth *auth = req->rq_task->tk_auth;
+	int replen;
+	int status;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, &hdr);
+	if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
+		goto out;
+	if ((status = encode_lookup(&xdr, args->name)) != 0)
+		goto out;
+	if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
+		goto out;
+	/* set up reply
+	 *   toplevel_status + OP_PUTFH + status
+	 *   + OP_LOOKUP + status + OP_GETATTR + status = 7
+	 */
+	replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
+	xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
+			0, PAGE_SIZE);
+out:
+	return status;
+}
+
+/*
  * START OF "GENERIC" DECODE ROUTINES.
  *   These may look a little ugly since they are imported from a "generic"
  * set of XDR encode/decode routines which are intended to be shared by
@@ -2036,7 +2084,7 @@ out:
 	} \
 } while (0)
 
-static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string)
+static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
 	uint32_t *p;
 
@@ -2087,7 +2135,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
 {
 	uint32_t *p;
-	uint32_t strlen;
+	unsigned int strlen;
 	char *str;
 
 	READ_BUF(12);
@@ -2217,7 +2265,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 	return 0;
 }
 
-static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid)
+static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
 	uint32_t *p;
 
@@ -2285,6 +2333,22 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	return 0;
 }
 
+static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+	uint32_t *p;
+
+	*fileid = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
+		READ_BUF(8);
+		READ64(*fileid);
+		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+	}
+	dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid);
+	return 0;
+}
+
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
 	uint32_t *p;
@@ -2336,6 +2400,151 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	return status;
 }
 
+/*
+ * Decode a pathname4 (a counted list of component4 strings) into @path.
+ *
+ * A component count of zero denotes the root and is stored as one
+ * zero-length component.  Each component's len/data is filled in by
+ * decode_opaque_inline(); nothing is copied here.
+ *
+ * Returns 0 on success, or -EIO if the count is negative or larger than
+ * NFS4_PATHNAME_MAXCOMPONENTS, or if a component cannot be decoded.
+ */
+static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
+{
+	int n;
+	uint32_t *p;
+	int status = 0;
+
+	READ_BUF(4);
+	READ32(n);
+	if (n < 0)
+		goto out_eio;
+	if (n == 0)
+		goto root_path;
+	dprintk("path ");
+	path->ncomponents = 0;
+	while (path->ncomponents < n) {
+		struct nfs4_string *component;
+
+		/* Check the bound before forming a pointer into components[] */
+		if (path->ncomponents >= NFS4_PATHNAME_MAXCOMPONENTS) {
+			dprintk("cannot parse %d components in path\n", n);
+			goto out_eio;
+		}
+		component = &path->components[path->ncomponents];
+		status = decode_opaque_inline(xdr, &component->len, &component->data);
+		if (unlikely(status != 0))
+			goto out_eio;
+		if (path->ncomponents != n)
+			dprintk("/");
+		dprintk("%s", component->data);
+		path->ncomponents++;
+	}
+out:
+	dprintk("\n");
+	return status;
+root_path:
+/* a root pathname is sent as a zero component4 */
+	path->ncomponents = 1;
+	path->components[0].len=0;
+	path->components[0].data=NULL;
+	dprintk("path /\n");
+	goto out;
+out_eio:
+	dprintk(" status %d", status);
+	status = -EIO;
+	goto out;
+}
+
+/*
+ * Decode the fs_locations attribute into @res when @bitmap carries it.
+ *
+ * The attribute is the fs root pathname followed by a counted list of
+ * locations, each a counted list of server names plus a root pathname.
+ * At most NFS4_FS_LOCATION_MAXSERVERS servers are kept per location;
+ * surplus server names are decoded and dropped.  A reply with more than
+ * NFS4_FS_LOCATIONS_MAXENTRIES locations is rejected rather than written
+ * past the end of res->locations[] (NOTE(review): both tables are assumed
+ * to be dimensioned by these constants; confirm against the header).
+ *
+ * Returns 0 on success (also when the attribute is absent), or -EIO if a
+ * lower-numbered word-0 attribute bit is still set or the data is malformed.
+ */
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
+{
+	int n;
+	uint32_t *p;
+	int status = -EIO;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
+		goto out;
+	status = 0;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
+		goto out;
+	dprintk("%s: fsroot ", __FUNCTION__);
+	status = decode_pathname(xdr, &res->fs_path);
+	if (unlikely(status != 0))
+		goto out;
+	READ_BUF(4);
+	READ32(n);
+	if (n <= 0)
+		goto out_eio;
+	res->nlocations = 0;
+	while (res->nlocations < n) {
+		int m;
+		struct nfs4_fs_location *loc;
+
+		/* Table full: fail rather than index past the last slot */
+		if (res->nlocations >= NFS4_FS_LOCATIONS_MAXENTRIES) {
+			dprintk("%s: cannot store %d locations\n", __FUNCTION__, n);
+			goto out_eio;
+		}
+		loc = &res->locations[res->nlocations];
+
+		READ_BUF(4);
+		READ32(m);
+		if (m <= 0)
+			goto out_eio;
+
+		loc->nservers = 0;
+		dprintk("%s: servers ", __FUNCTION__);
+		while (loc->nservers < m) {
+			struct nfs4_string *server;
+
+			if (loc->nservers >= NFS4_FS_LOCATION_MAXSERVERS) {
+				int i;
+				dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
+				/* Consume the surplus names to keep the stream in sync */
+				for (i = loc->nservers; i < m; i++) {
+					unsigned int len;
+					char *data;
+					status = decode_opaque_inline(xdr, &len, &data);
+					if (unlikely(status != 0))
+						goto out_eio;
+				}
+				break;
+			}
+			server = &loc->servers[loc->nservers];
+			status = decode_opaque_inline(xdr, &server->len, &server->data);
+			if (unlikely(status != 0))
+				goto out_eio;
+			dprintk("%s ", server->data);
+			loc->nservers++;
+		}
+		status = decode_pathname(xdr, &loc->rootpath);
+		if (unlikely(status != 0))
+			goto out_eio;
+		res->nlocations++;
+	}
+out:
+	dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status);
+	return status;
+out_eio:
+	status = -EIO;
+	goto out;
+}
+
 static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
 	uint32_t *p;
@@ -2841,6 +3015,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		 bitmap[2] = {0},
 		 type;
 	int status, fmode = 0;
+	uint64_t fileid;
 
 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
 		goto xdr_error;
@@ -2863,10 +3038,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
 		goto xdr_error;
-	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0)
+	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
+	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+						struct nfs4_fs_locations,
+						fattr))) != 0)
+		goto xdr_error;
 	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
 		goto xdr_error;
 	fattr->mode |= fmode;
@@ -2886,6 +3065,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
 		goto xdr_error;
+	if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
+		goto xdr_error;
+	if (fattr->fileid == 0 && fileid != 0)
+		fattr->fileid = fileid;
 	if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
 		fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
 xdr_error:
@@ -3350,8 +3533,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 					attrlen, recvd);
 			return -EINVAL;
 		}
-		if (attrlen <= *acl_len)
-			xdr_read_pages(xdr, attrlen);
+		xdr_read_pages(xdr, attrlen);
 		*acl_len = attrlen;
 	} else
 		status = -EOPNOTSUPP;
@@ -4211,6 +4393,29 @@ out:
 	return status;
 }
 
+/*
+ * FS_LOCATIONS request
+ */
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status != 0)
+		goto out;
+	if ((status = decode_putfh(&xdr)) != 0)
+		goto out;
+	if ((status = decode_lookup(&xdr)) != 0)
+		goto out;
+	xdr_enter_page(&xdr, PAGE_SIZE);
+	status = decode_getfattr(&xdr, &res->fattr, res->server);
+out:
+	return status;
+}
+
 uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
@@ -4382,6 +4587,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(DELEGRETURN,	enc_delegreturn, dec_delegreturn),
   PROC(GETACL,		enc_getacl,	dec_getacl),
   PROC(SETACL,		enc_setacl,	dec_setacl),
+  PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
 };
 
 struct rpc_version		nfs_version4 = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 106aca388ebc..ef9429643ebc 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -325,6 +325,7 @@ out:
 
 /**
  * nfs_scan_list - Scan a list for matching requests
+ * @nfsi: NFS inode
  * @head: One of the NFS inode request lists
  * @dst: Destination list
  * @idx_start: lower bound of page->index to scan
@@ -336,14 +337,15 @@ out:
  * The requests are *not* checked to ensure that they form a contiguous set.
  * You must be holding the inode's req_lock when calling this function
  */
-int
-nfs_scan_list(struct list_head *head, struct list_head *dst,
-	      unsigned long idx_start, unsigned int npages)
+int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
+		struct list_head *dst, unsigned long idx_start,
+		unsigned int npages)
 {
-	struct list_head	*pos, *tmp;
-	struct nfs_page		*req;
-	unsigned long		idx_end;
-	int			res;
+	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
+	struct nfs_page *req;
+	unsigned long idx_end;
+	int found, i;
+	int res;
 
 	res = 0;
 	if (npages == 0)
@@ -351,25 +353,32 @@ nfs_scan_list(struct list_head *head, struct list_head *dst,
 	else
 		idx_end = idx_start + npages - 1;
 
-	list_for_each_safe(pos, tmp, head) {
-
-		req = nfs_list_entry(pos);
-
-		if (req->wb_index < idx_start)
-			continue;
-		if (req->wb_index > idx_end)
+	for (;;) {
+		found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
+				(void **)&pgvec[0], idx_start,
+				NFS_SCAN_MAXENTRIES);
+		if (found <= 0)
 			break;
+		for (i = 0; i < found; i++) {
+			req = pgvec[i];
+			if (req->wb_index > idx_end)
+				goto out;
+			idx_start = req->wb_index + 1;
+			if (req->wb_list_head != head)
+				continue;
+			if (nfs_set_page_writeback_locked(req)) {
+				nfs_list_remove_request(req);
+				nfs_list_add_request(req, dst);
+				res++;
+			}
+		}
 
-		if (!nfs_set_page_writeback_locked(req))
-			continue;
-		nfs_list_remove_request(req);
-		nfs_list_add_request(req, dst);
-		res++;
 	}
+out:
 	return res;
 }
 
-int nfs_init_nfspagecache(void)
+int __init nfs_init_nfspagecache(void)
 {
 	nfs_page_cachep = kmem_cache_create("nfs_page",
 					    sizeof(struct nfs_page),
@@ -381,7 +390,7 @@ int nfs_init_nfspagecache(void)
 	return 0;
 }
 
-void nfs_destroy_nfspagecache(void)
+void __exit nfs_destroy_nfspagecache(void)
 {
 	if (kmem_cache_destroy(nfs_page_cachep))
 		printk(KERN_INFO "nfs_page: not all structures were freed\n");
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 9dd85cac2df0..b3899ea3229e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -44,11 +44,10 @@
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
 #include <linux/smp_lock.h>
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
-extern struct rpc_procinfo nfs_procedures[];
-
 /*
  * Bare-bones access to getattr: this is for nfs_read_super.
  */
@@ -611,8 +610,6 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return 0;
 }
 
-extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
-
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
 	if (task->tk_status >= 0) {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 624ca7146b6b..41c2ffee24f5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -51,14 +51,11 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
 			}
@@ -104,6 +101,44 @@ int nfs_return_empty_page(struct page *page)
 	return 0;
 }
 
+/*
+ * Zero the part of a read request that the server did not fill in.
+ *
+ * Acts only when the reply flagged EOF and returned fewer bytes than were
+ * requested.  The unfilled range can run across several entries of
+ * data->args.pages, so every page it touches is cleared rather than just
+ * the page in which the returned data ended.
+ */
+static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
+{
+	unsigned int remainder = data->args.count - data->res.count;
+	unsigned int base = data->args.pgbase + data->res.count;
+	unsigned int pglen;
+	struct page **pages;
+
+	if (data->res.eof == 0 || remainder == 0)
+		return;
+	/*
+	 * Note: the XDR code is expected to have rejected any reply with
+	 * res.count > args.count, so "remainder" cannot have wrapped.
+	 */
+	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
+	base &= ~PAGE_CACHE_MASK;
+	pglen = PAGE_CACHE_SIZE - base;
+	for (;;) {
+		if (remainder <= pglen) {
+			memclear_highpage_flush(*pages, base, remainder);
+			break;
+		}
+		/* Clear to the end of this page, then continue at offset 0 */
+		memclear_highpage_flush(*pages, base, pglen);
+		pages++;
+		remainder -= pglen;
+		pglen = PAGE_CACHE_SIZE;
+		base = 0;
+	}
+}
+
 /*
  * Read a page synchronously.
  */
@@ -177,11 +196,9 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 	spin_unlock(&inode->i_lock);
 
-	if (count)
-		memclear_highpage_flush(page, rdata->args.pgbase, count);
-	SetPageUptodate(page);
-	if (PageError(page))
-		ClearPageError(page);
+	nfs_readpage_truncate_uninitialised_page(rdata);
+	if (rdata->res.eof || rdata->res.count == rdata->args.count)
+		SetPageUptodate(page);
 	result = 0;
 
 io_error:
@@ -436,20 +453,12 @@ static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
 	struct nfs_page *req = data->req;
 	struct page *page = req->wb_page;
  
+	if (likely(task->tk_status >= 0))
+		nfs_readpage_truncate_uninitialised_page(data);
+	else
+		SetPageError(page);
 	if (nfs_readpage_result(task, data) != 0)
 		return;
-	if (task->tk_status >= 0) {
-		unsigned int request = data->args.count;
-		unsigned int result = data->res.count;
-
-		if (result < request) {
-			memclear_highpage_flush(page,
-						data->args.pgbase + result,
-						request - result);
-		}
-	} else
-		SetPageError(page);
-
 	if (atomic_dec_and_test(&req->wb_complete)) {
 		if (!PageError(page))
 			SetPageUptodate(page);
@@ -462,6 +471,56 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
 	.rpc_release = nfs_readdata_release,
 };
 
+/*
+ * Mark as up to date the pages that a successful read reply has filled.
+ *
+ * Walks data->args.pages from the page holding args.pgbase and flags each
+ * page that res.count covers completely.  A trailing, partly covered page
+ * is flagged only if the reply hit EOF or the read was not short.
+ */
+static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
+{
+	unsigned int count = data->res.count;
+	unsigned int base = data->args.pgbase;
+	struct page **pages;
+
+	if (unlikely(count == 0))
+		return;
+	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
+	base &= ~PAGE_CACHE_MASK;
+	count += base;
+	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
+		SetPageUptodate(*pages);
+	/*
+	 * Was this an eof or a short read? If the latter, don't mark the page
+	 * as uptodate yet.
+	 */
+	if (count > 0 && (data->res.eof || data->args.count == data->res.count))
+		SetPageUptodate(*pages);
+}
+
+/*
+ * Flag every page of a read request with SetPageError().
+ *
+ * Covers the whole requested range (args.pgbase .. args.pgbase + args.count),
+ * including a final page that the request only partly spans.
+ */
+static void nfs_readpage_set_pages_error(struct nfs_read_data *data)
+{
+	unsigned int count = data->args.count;
+	unsigned int base = data->args.pgbase;
+	struct page **pages;
+
+	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
+	base &= ~PAGE_CACHE_MASK;
+	count += base;
+	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
+		SetPageError(*pages);
+	/* The request may end part-way through one more page */
+	if (count != 0)
+		SetPageError(*pages);
+}
+
 /*
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
@@ -469,27 +512,24 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
 static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-	unsigned int count = data->res.count;
 
+	/*
+	 * Note: nfs_readpage_result may change the values of
+	 * data->args. In the multi-page case, we therefore need
+	 * to ensure that the helpers below, which walk data->args
+	 * to update the page state, are called before it.
+	 */
+	if (likely(task->tk_status >= 0)) {
+		nfs_readpage_truncate_uninitialised_page(data);
+		nfs_readpage_set_pages_uptodate(data);
+	} else
+		nfs_readpage_set_pages_error(data);
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 	while (!list_empty(&data->pages)) {
 		struct nfs_page *req = nfs_list_entry(data->pages.next);
-		struct page *page = req->wb_page;
-		nfs_list_remove_request(req);
 
-		if (task->tk_status >= 0) {
-			if (count < PAGE_CACHE_SIZE) {
-				if (count < req->wb_bytes)
-					memclear_highpage_flush(page,
-							req->wb_pgbase + count,
-							req->wb_bytes - count);
-				count = 0;
-			} else
-				count -= PAGE_CACHE_SIZE;
-			SetPageUptodate(page);
-		} else
-			SetPageError(page);
+		nfs_list_remove_request(req);
 		nfs_readpage_release(req);
 	}
 }
@@ -654,7 +694,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	return ret;
 }
 
-int nfs_init_readpagecache(void)
+int __init nfs_init_readpagecache(void)
 {
 	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
 					     sizeof(struct nfs_read_data),
@@ -671,7 +711,7 @@ int nfs_init_readpagecache(void)
 	return 0;
 }
 
-void nfs_destroy_readpagecache(void)
+void __exit nfs_destroy_readpagecache(void)
 {
 	mempool_destroy(nfs_rdata_mempool);
 	if (kmem_cache_destroy(nfs_rdata_cachep))
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
new file mode 100644
index 000000000000..e8a9bee74d9d
--- /dev/null
+++ b/fs/nfs/super.c
@@ -0,0 +1,1537 @@
+/*
+ *  linux/fs/nfs/super.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  nfs superblock handling functions
+ *
+ *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ *  J.S.Peatfield@damtp.cam.ac.uk
+ *
+ *  Split from inode.c by David Howells <dhowells@redhat.com>
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/nfs_idmap.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+/* Maximum number of readahead requests
+ * FIXME: this should really be a sysctl so that users may tune it to suit
+ *        their needs. People that do NFS over a slow network, might for
+ *        instance want to reduce it to something closer to 1 for improved
+ *        interactive response.
+ */
+#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
+
+/*
+ * RPC cruft for NFS
+ */
+static struct rpc_version * nfs_version[] = {
+	NULL,
+	NULL,
+	&nfs_version2,
+#if defined(CONFIG_NFS_V3)
+	&nfs_version3,
+#elif defined(CONFIG_NFS_V4)
+	NULL,
+#endif
+#if defined(CONFIG_NFS_V4)
+	&nfs_version4,
+#endif
+};
+
+static struct rpc_program nfs_program = {
+	.name			= "nfs",
+	.number			= NFS_PROGRAM,
+	.nrvers			= ARRAY_SIZE(nfs_version),
+	.version		= nfs_version,
+	.stats			= &nfs_rpcstat,
+	.pipe_dir_name		= "/nfs",
+};
+
+struct rpc_stat nfs_rpcstat = {
+	.program		= &nfs_program
+};
+
+
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
+static struct rpc_version *	nfsacl_version[] = {
+	[3]			= &nfsacl_version3,
+};
+
+struct rpc_program		nfsacl_program = {
+	.name =			"nfsacl",
+	.number =		NFS_ACL_PROGRAM,
+	.nrvers =		ARRAY_SIZE(nfsacl_version),
+	.version =		nfsacl_version,
+	.stats =		&nfsacl_rpcstat,
+};
+#endif  /* CONFIG_NFS_V3_ACL */
+
+static void nfs_umount_begin(struct vfsmount *, int);
+static int  nfs_statfs(struct dentry *, struct kstatfs *);
+static int  nfs_show_options(struct seq_file *, struct vfsmount *);
+static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
+static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
+static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static void nfs_kill_super(struct super_block *);
+
+static struct file_system_type nfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_get_sb,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+struct file_system_type clone_nfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_clone_nfs_sb,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+static struct super_operations nfs_sops = {
+	.alloc_inode	= nfs_alloc_inode,
+	.destroy_inode	= nfs_destroy_inode,
+	.write_inode	= nfs_write_inode,
+	.statfs		= nfs_statfs,
+	.clear_inode	= nfs_clear_inode,
+	.umount_begin	= nfs_umount_begin,
+	.show_options	= nfs_show_options,
+	.show_stats	= nfs_show_stats,
+};
+
+#ifdef CONFIG_NFS_V4
+static int nfs4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static void nfs4_kill_super(struct super_block *sb);
+
+static struct file_system_type nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs4_get_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+struct file_system_type clone_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs_clone_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+struct file_system_type nfs_referral_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs_referral_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+static struct super_operations nfs4_sops = {
+	.alloc_inode	= nfs_alloc_inode,
+	.destroy_inode	= nfs_destroy_inode,
+	.write_inode	= nfs_write_inode,
+	.statfs		= nfs_statfs,
+	.clear_inode	= nfs4_clear_inode,
+	.umount_begin	= nfs_umount_begin,
+	.show_options	= nfs_show_options,
+	.show_stats	= nfs_show_stats,
+};
+#endif
+
+#ifdef CONFIG_NFS_V4
+static const int nfs_set_port_min = 0;
+static const int nfs_set_port_max = 65535;
+
+static int param_set_port(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	int num = simple_strtol(val, &endp, 0);
+	if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+		return -EINVAL;
+	*((int *)kp->arg) = num;
+	return 0;
+}
+
+module_param_call(callback_tcpport, param_set_port, param_get_int,
+		 &nfs_callback_set_tcpport, 0644);
+#endif
+
+#ifdef CONFIG_NFS_V4
+static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	long num = simple_strtol(val, &endp, 0);
+	/* seconds -> jiffies: reject values whose product would overflow int */
+	if (endp == val || *endp || num < 0 || num > INT_MAX / HZ)
+		return -EINVAL;
+	*((int *)kp->arg) = (int)(num * HZ);
+	return 0;
+}
+
+module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
+		 &nfs_idmap_cache_timeout, 0644);
+#endif
+
+/*
+ * Register the NFS filesystems
+ */
+int __init register_nfs_fs(void)
+{
+	int ret;
+
+        ret = register_filesystem(&nfs_fs_type);
+	if (ret < 0)
+		goto error_0;
+
+#ifdef CONFIG_NFS_V4
+	ret = nfs_register_sysctl();
+	if (ret < 0)
+		goto error_1;
+	ret = register_filesystem(&nfs4_fs_type);
+	if (ret < 0)
+		goto error_2;
+#endif
+	return 0;
+
+#ifdef CONFIG_NFS_V4
+error_2:
+	nfs_unregister_sysctl();
+error_1:
+	unregister_filesystem(&nfs_fs_type);
+#endif
+error_0:
+	return ret;
+}
+
+/*
+ * Unregister the NFS filesystems
+ */
+void __exit unregister_nfs_fs(void)
+{
+#ifdef CONFIG_NFS_V4
+	unregister_filesystem(&nfs4_fs_type);
+	nfs_unregister_sysctl();
+#endif
+	unregister_filesystem(&nfs_fs_type);
+}
+
+/*
+ * Deliver file system statistics to userspace
+ */
+static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct nfs_server *server = NFS_SB(sb);
+	unsigned char blockbits;
+	unsigned long blockres;
+	struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
+	struct nfs_fattr fattr;
+	struct nfs_fsstat res = {
+			.fattr = &fattr,
+	};
+	int error;
+
+	lock_kernel();
+
+	error = server->rpc_ops->statfs(server, rootfh, &res);
+	buf->f_type = NFS_SUPER_MAGIC;
+	if (error < 0)
+		goto out_err;
+
+	/*
+	 * Current versions of glibc do not correctly handle the
+	 * case where f_frsize != f_bsize.  Eventually we want to
+	 * report the value of wtmult in this field.
+	 */
+	buf->f_frsize = sb->s_blocksize;
+
+	/*
+	 * On most *nix systems, f_blocks, f_bfree, and f_bavail
+	 * are reported in units of f_frsize.  Linux hasn't had
+	 * an f_frsize field in its statfs struct until recently,
+	 * thus historically Linux's sys_statfs reports these
+	 * fields in units of f_bsize.
+	 */
+	buf->f_bsize = sb->s_blocksize;
+	blockbits = sb->s_blocksize_bits;
+	blockres = (1 << blockbits) - 1;
+	buf->f_blocks = (res.tbytes + blockres) >> blockbits;
+	buf->f_bfree = (res.fbytes + blockres) >> blockbits;
+	buf->f_bavail = (res.abytes + blockres) >> blockbits;
+
+	buf->f_files = res.tfiles;
+	buf->f_ffree = res.afiles;
+
+	buf->f_namelen = server->namelen;
+ out:
+	unlock_kernel();
+	return 0;
+
+ out_err:
+	dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
+	buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
+	goto out;
+
+}
+
+static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
+{
+	static struct {
+		rpc_authflavor_t flavour;
+		const char *str;
+	} sec_flavours[] = {
+		{ RPC_AUTH_NULL, "null" },
+		{ RPC_AUTH_UNIX, "sys" },
+		{ RPC_AUTH_GSS_KRB5, "krb5" },
+		{ RPC_AUTH_GSS_KRB5I, "krb5i" },
+		{ RPC_AUTH_GSS_KRB5P, "krb5p" },
+		{ RPC_AUTH_GSS_LKEY, "lkey" },
+		{ RPC_AUTH_GSS_LKEYI, "lkeyi" },
+		{ RPC_AUTH_GSS_LKEYP, "lkeyp" },
+		{ RPC_AUTH_GSS_SPKM, "spkm" },
+		{ RPC_AUTH_GSS_SPKMI, "spkmi" },
+		{ RPC_AUTH_GSS_SPKMP, "spkmp" },
+		{ -1, "unknown" }
+	};
+	int i;
+
+	for (i=0; sec_flavours[i].flavour != -1; i++) {
+		if (sec_flavours[i].flavour == flavour)
+			break;
+	}
+	return sec_flavours[i].str;
+}
+
+/*
+ * Describe the mount options in force on this server representation
+ */
+static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
+{
+	static struct proc_nfs_info {
+		int flag;
+		char *str;
+		char *nostr;
+	} nfs_info[] = {
+		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
+		{ NFS_MOUNT_INTR, ",intr", "" },
+		{ NFS_MOUNT_NOCTO, ",nocto", "" },
+		{ NFS_MOUNT_NOAC, ",noac", "" },
+		{ NFS_MOUNT_NONLM, ",nolock", "" },
+		{ NFS_MOUNT_NOACL, ",noacl", "" },
+		{ 0, NULL, NULL }
+	};
+	struct proc_nfs_info *nfs_infop;
+	char buf[12];
+	char *proto;
+
+	seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
+	seq_printf(m, ",rsize=%d", nfss->rsize);
+	seq_printf(m, ",wsize=%d", nfss->wsize);
+	if (nfss->acregmin != 3*HZ || showdefaults)
+		seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
+	if (nfss->acregmax != 60*HZ || showdefaults)
+		seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
+	if (nfss->acdirmin != 30*HZ || showdefaults)
+		seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
+	if (nfss->acdirmax != 60*HZ || showdefaults)
+		seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
+	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
+		if (nfss->flags & nfs_infop->flag)
+			seq_puts(m, nfs_infop->str);
+		else
+			seq_puts(m, nfs_infop->nostr);
+	}
+	switch (nfss->client->cl_xprt->prot) {
+		case IPPROTO_TCP:
+			proto = "tcp";
+			break;
+		case IPPROTO_UDP:
+			proto = "udp";
+			break;
+		default:
+			snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
+			proto = buf;
+	}
+	seq_printf(m, ",proto=%s", proto);
+	seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
+	seq_printf(m, ",retrans=%u", nfss->retrans_count);
+	seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
+}
+
+/*
+ * Describe the mount options on this VFS mountpoint
+ */
+static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+
+	nfs_show_mount_options(m, nfss, 0);
+
+	seq_puts(m, ",addr=");
+	seq_escape(m, nfss->hostname, " \t\n\\");
+
+	return 0;
+}
+
+/*
+ * Present statistical information for this VFS mountpoint
+ */
+static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
+{
+	int i, cpu;
+	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+	struct rpc_auth *auth = nfss->client->cl_auth;
+	struct nfs_iostats totals = { };
+
+	seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
+
+	/*
+	 * Display all mount option settings
+	 */
+	seq_printf(m, "\n\topts:\t");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
+	nfs_show_mount_options(m, nfss, 1);
+
+	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
+
+	seq_printf(m, "\n\tcaps:\t");
+	seq_printf(m, "caps=0x%x", nfss->caps);
+	seq_printf(m, ",wtmult=%d", nfss->wtmult);
+	seq_printf(m, ",dtsize=%d", nfss->dtsize);
+	seq_printf(m, ",bsize=%d", nfss->bsize);
+	seq_printf(m, ",namelen=%d", nfss->namelen);
+
+#ifdef CONFIG_NFS_V4
+	if (nfss->rpc_ops->version == 4) {
+		seq_printf(m, "\n\tnfsv4:\t");
+		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+	}
+#endif
+
+	/*
+	 * Display security flavor in effect for this mount
+	 */
+	seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
+	if (auth->au_flavor)
+		seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
+
+	/*
+	 * Display superblock I/O counters
+	 */
+	for_each_possible_cpu(cpu) {
+		struct nfs_iostats *stats;
+
+		preempt_disable();
+		stats = per_cpu_ptr(nfss->io_stats, cpu);
+
+		for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+			totals.events[i] += stats->events[i];
+		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+			totals.bytes[i] += stats->bytes[i];
+
+		preempt_enable();
+	}
+
+	seq_printf(m, "\n\tevents:\t");
+	for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+		seq_printf(m, "%lu ", totals.events[i]);
+	seq_printf(m, "\n\tbytes:\t");
+	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+		seq_printf(m, "%Lu ", totals.bytes[i]);
+	seq_printf(m, "\n");
+
+	rpc_print_iostats(m, nfss->client);
+
+	return 0;
+}
+
+/*
+ * Begin unmount by attempting to remove all automounted mountpoints we added
+ * in response to traversals; if MNT_FORCE is set, also kill all RPC tasks
+ */
+static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+	struct nfs_server *server;
+	struct rpc_clnt	*rpc;
+
+	shrink_submounts(vfsmnt, &nfs_automount_list);
+	if (!(flags & MNT_FORCE))
+		return;
+	/* -EIO all pending I/O */
+	server = NFS_SB(vfsmnt->mnt_sb);
+	rpc = server->client;
+	if (!IS_ERR(rpc))
+		rpc_killall_tasks(rpc);
+	rpc = server->client_acl;
+	if (!IS_ERR(rpc))
+		rpc_killall_tasks(rpc);
+}
+
+/*
+ * Obtain the root inode of the file system.
+ */
+static struct inode *
+nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
+{
+	struct nfs_server	*server = NFS_SB(sb);
+	int			error;
+
+	error = server->rpc_ops->getroot(server, rootfh, fsinfo);
+	if (error < 0) {
+		dprintk("nfs_get_root: getattr error = %d\n", -error);
+		return ERR_PTR(error);
+	}
+
+	server->fsid = fsinfo->fattr->fsid;
+	return nfs_fhget(sb, rootfh, fsinfo->fattr);
+}
+
+/*
+ * Do NFS version-independent mount processing, and sanity checking
+ */
+static int
+nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
+{
+	struct nfs_server	*server;
+	struct inode		*root_inode;
+	struct nfs_fattr	fattr;
+	struct nfs_fsinfo	fsinfo = {
+					.fattr = &fattr,
+				};
+	struct nfs_pathconf pathinfo = {
+			.fattr = &fattr,
+	};
+	int no_root_error = 0;
+	unsigned long max_rpc_payload;
+
+	/* We probably want something more informative here */
+	snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+	server = NFS_SB(sb);
+
+	sb->s_magic      = NFS_SUPER_MAGIC;
+
+	server->io_stats = nfs_alloc_iostats();
+	if (server->io_stats == NULL)
+		return -ENOMEM;
+
+	root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
+	/* Did getting the root inode fail? */
+	if (IS_ERR(root_inode)) {
+		no_root_error = PTR_ERR(root_inode);
+		goto out_no_root;
+	}
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root) {
+		no_root_error = -ENOMEM;
+		goto out_no_root;
+	}
+	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+
+	/* mount time stamp, in jiffies */
+	server->mount_time = jiffies;
+
+	/* Get some general file system info */
+	if (server->namelen == 0 &&
+	    server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
+		server->namelen = pathinfo.max_namelen;
+	/* Work out a lot of parameters */
+	if (server->rsize == 0)
+		server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
+	if (server->wsize == 0)
+		server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
+
+	if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
+		server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
+	if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
+		server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
+
+	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
+	if (server->rsize > max_rpc_payload)
+		server->rsize = max_rpc_payload;
+	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+		server->rsize = NFS_MAX_FILE_IO_SIZE;
+	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (server->wsize > max_rpc_payload)
+		server->wsize = max_rpc_payload;
+	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+		server->wsize = NFS_MAX_FILE_IO_SIZE;
+	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (sb->s_blocksize == 0)
+		sb->s_blocksize = nfs_block_bits(server->wsize,
+							 &sb->s_blocksize_bits);
+	server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
+
+	server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
+	if (server->dtsize > PAGE_CACHE_SIZE)
+		server->dtsize = PAGE_CACHE_SIZE;
+	if (server->dtsize > server->rsize)
+		server->dtsize = server->rsize;
+
+	if (server->flags & NFS_MOUNT_NOAC) {
+		server->acregmin = server->acregmax = 0;
+		server->acdirmin = server->acdirmax = 0;
+		sb->s_flags |= MS_SYNCHRONOUS;
+	}
+	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+
+	nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+
+	server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
+	server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
+
+	/* We're airborne Set socket buffersize */
+	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+	return 0;
+	/* Yargs. It didn't work out. */
+out_no_root:
+	dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
+	if (!IS_ERR(root_inode))
+		iput(root_inode);
+	return no_root_error;
+}
+
+/*
+ * Fill in the RPC timeout descriptor @to for a new connection.
+ *
+ * @timeo is scaled by HZ / 10 to give the initial timeout and @retrans is
+ * stored as the retry count; a retry count of zero is replaced by 2 and an
+ * initial timeout of zero by a per-protocol default.  For IPPROTO_TCP the
+ * initial value is capped at NFS_MAX_TCP_TIMEOUT, the increment equals the
+ * initial value and to_exponential is cleared.  Every other protocol is
+ * handled like UDP: capped at NFS_MAX_UDP_TIMEOUT with to_exponential set.
+ */
+static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
+{
+	to->to_retries = retrans;
+	if (to->to_retries == 0)
+		to->to_retries = 2;
+	to->to_initval = timeo * HZ / 10;
+
+	if (proto == IPPROTO_TCP) {
+		if (to->to_initval == 0)
+			to->to_initval = 60 * HZ;
+		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+			to->to_initval = NFS_MAX_TCP_TIMEOUT;
+		to->to_increment = to->to_initval;
+		to->to_exponential = 0;
+		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+		return;
+	}
+
+	/* IPPROTO_UDP and anything unrecognised */
+	if (to->to_initval == 0)
+		to->to_initval = 11 * HZ / 10;
+	if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+		to->to_initval = NFS_MAX_UDP_TIMEOUT;
+	to->to_exponential = 1;
+	to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+}
+
+/*
+ * Create an RPC client handle.
+ *
+ * The timeout parameters are derived from the mount data and recorded in
+ * @server, then a transport is created for server->addr and an RPC client
+ * is created on top of it.  The new client has cl_intr and cl_softrtry set.
+ *
+ * Returns the new client, or the ERR_PTR()-encoded error of whichever
+ * creation step failed.
+ */
+static struct rpc_clnt *
+nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
+{
+	struct rpc_timeout	timeparms;
+	struct rpc_xprt		*xprt;
+	struct rpc_clnt		*clnt;
+	int			proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
+
+	nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
+
+	server->retrans_timeo = timeparms.to_initval;
+	server->retrans_count = timeparms.to_retries;
+
+	/* create transport and client */
+	xprt = xprt_create_proto(proto, &server->addr, &timeparms);
+	if (IS_ERR(xprt)) {
+		dprintk("%s: cannot create RPC transport. Error = %ld\n",
+				__FUNCTION__, PTR_ERR(xprt));
+		return (struct rpc_clnt *)xprt;
+	}
+	clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
+				 server->rpc_ops->version, data->pseudoflavor);
+	if (IS_ERR(clnt)) {
+		/* Report the client's error; xprt is a valid pointer here */
+		dprintk("%s: cannot create RPC client. Error = %ld\n",
+				__FUNCTION__, PTR_ERR(clnt));
+		return clnt;
+	}
+
+	clnt->cl_intr     = 1;
+	clnt->cl_softrtry = 1;
+
+	return clnt;
+}
+
+/*
+ * Clone a server record
+ *
+ * Finishes setting up the superblock @sb, whose nfs_server was seeded from
+ * the parent superblock data->sb: the generic superblock parameters are
+ * copied from the parent, the parent's RPC clients are cloned, and a root
+ * dentry is attached for the file handle in @data.
+ *
+ * Returns the server record on success or an ERR_PTR() on failure.
+ */
+static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_server *parent = NFS_SB(data->sb);
+	struct inode *root_inode;
+	struct nfs_fsinfo fsinfo;
+	void *err = ERR_PTR(-ENOMEM);
+
+	sb->s_op = data->sb->s_op;
+	sb->s_blocksize = data->sb->s_blocksize;
+	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
+	sb->s_maxbytes = data->sb->s_maxbytes;
+
+	/*
+	 * Mark every client as "not set up" before anything can fail, so that
+	 * teardown never shuts down a client pointer that was merely copied
+	 * from the parent's record.
+	 */
+	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	server->io_stats = nfs_alloc_iostats();
+	if (server->io_stats == NULL)
+		goto out;
+
+	server->client = rpc_clone_client(parent->client);
+	if (IS_ERR((err = server->client)))
+		goto out;
+
+	if (!IS_ERR(parent->client_sys)) {
+		server->client_sys = rpc_clone_client(parent->client_sys);
+		if (IS_ERR((err = server->client_sys)))
+			goto out;
+	}
+	if (!IS_ERR(parent->client_acl)) {
+		server->client_acl = rpc_clone_client(parent->client_acl);
+		if (IS_ERR((err = server->client_acl)))
+			goto out;
+	}
+
+	/*
+	 * "err" now holds the last successfully cloned client, which is not
+	 * an error value; reset it so the failure paths below report an error
+	 * instead of returning that pointer to the caller.
+	 */
+	err = ERR_PTR(-ENOMEM);
+	root_inode = nfs_fhget(sb, data->fh, data->fattr);
+	if (root_inode == NULL)
+		goto out;
+	/* Also cope with nfs_fhget() reporting failure as an ERR_PTR() */
+	if (IS_ERR(root_inode)) {
+		err = root_inode;
+		goto out;
+	}
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root)
+		goto out_put_root;
+	fsinfo.fattr = data->fattr;
+	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
+		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+	sb->s_flags |= MS_ACTIVE;
+	return server;
+out_put_root:
+	iput(root_inode);
+out:
+	return err;
+}
+
+/*
+ * Copy an existing superblock and attach revised data
+ *
+ * A new nfs_server is allocated as a byte copy of the parent's record and
+ * given its own copy of the hostname.  @fill_sb then looks up or creates
+ * the superblock.  If that superblock already has a root, the new server
+ * record is discarded and the existing superblock is mounted; otherwise
+ * @fill_server completes the setup.
+ *
+ * Returns the result of simple_set_mnt() on success or a negative errno.
+ */
+static int nfs_clone_generic_sb(struct nfs_clone_mount *data,
+		struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
+		struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *),
+		struct vfsmount *mnt)
+{
+	struct nfs_server *parent = NFS_SB(data->sb);
+	struct nfs_server *server;
+	struct super_block *sb;
+	char *name;
+	int namelen;
+	int error;
+
+	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (!server)
+		return -ENOMEM;
+	memcpy(server, parent, sizeof(*server));
+
+	/* Prefer the hostname supplied with the clone request, if any */
+	name = data->hostname ? data->hostname : parent->hostname;
+	namelen = strlen(name) + 1;
+	server->hostname = kmalloc(namelen, GFP_KERNEL);
+	if (!server->hostname) {
+		error = -ENOMEM;
+		goto free_server;
+	}
+	memcpy(server->hostname, name, namelen);
+
+	error = rpciod_up();
+	if (error)
+		goto free_hostname;
+
+	sb = fill_sb(server, data);
+	if (IS_ERR(sb)) {
+		error = PTR_ERR(sb);
+		goto kill_rpciod;
+	}
+
+	if (sb->s_root) {
+		/* The superblock is already set up: drop our server record */
+		rpciod_down();
+		kfree(server->hostname);
+		kfree(server);
+		return simple_set_mnt(mnt, sb);
+	}
+
+	server = fill_server(sb, data);
+	if (IS_ERR(server)) {
+		error = PTR_ERR(server);
+		up_write(&sb->s_umount);
+		deactivate_super(sb);
+		return error;
+	}
+	return simple_set_mnt(mnt, sb);
+
+kill_rpciod:
+	rpciod_down();
+free_hostname:
+	kfree(server->hostname);
+free_server:
+	kfree(server);
+	return error;
+}
+
+/*
+ * Set up an NFS2/3 superblock
+ *
+ * The way this works is that the mount process passes a structure
+ * in the data argument which contains the server's IP address
+ * and the root file handle obtained from the server's mount
+ * daemon. We stash these away in the private superblock fields.
+ *
+ * Returns 0 on success or a negative errno.  The error returns in this
+ * function release nothing that was acquired earlier (lockd, hostname,
+ * RPC clients).  NOTE(review): presumably the caller's deactivate_super()
+ * ends up in nfs_kill_super(), which does that teardown - confirm.
+ * The "silent" argument is not used here.
+ */
+static int
+nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
+{
+	struct nfs_server	*server;
+	rpc_authflavor_t	authflavor;
+
+	server           = NFS_SB(sb);
+	sb->s_blocksize_bits = 0;
+	sb->s_blocksize = 0;
+	if (data->bsize)
+		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
+	if (data->rsize)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+	server->flags    = data->flags & NFS_MOUNT_FLAGMASK;
+
+	server->acregmin = data->acregmin*HZ;
+	server->acregmax = data->acregmax*HZ;
+	server->acdirmin = data->acdirmin*HZ;
+	server->acdirmax = data->acdirmax*HZ;
+
+	/* Start lockd here, before we might error out */
+	if (!(server->flags & NFS_MOUNT_NONLM))
+		lockd_up();	/* return value is not checked */
+
+	server->namelen  = data->namlen;
+	server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
+	if (!server->hostname)
+		return -ENOMEM;
+	strcpy(server->hostname, data->hostname);
+
+	/* Check NFS protocol revision and initialize RPC op vector
+	 * and file handle pool. */
+#ifdef CONFIG_NFS_V3
+	if (server->flags & NFS_MOUNT_VER3) {
+		server->rpc_ops = &nfs_v3_clientops;
+		server->caps |= NFS_CAP_READDIRPLUS;
+	} else {
+		server->rpc_ops = &nfs_v2_clientops;
+	}
+#else
+	server->rpc_ops = &nfs_v2_clientops;
+#endif
+
+	/* Fill in pseudoflavor for mount version < 5 */
+	if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
+		data->pseudoflavor = RPC_AUTH_UNIX;
+	authflavor = data->pseudoflavor;	/* save for sb_init() */
+	/* XXX maybe we want to add a server->pseudoflavor field */
+
+	/* Create RPC client handles */
+	server->client = nfs_create_client(server, data);
+	if (IS_ERR(server->client))
+		return PTR_ERR(server->client);
+	/* RFC 2623, sec 2.3.2
+	 * With a non-AUTH_UNIX flavour, client_sys is a separate clone that
+	 * gets an AUTH_UNIX auth; otherwise it shares server->client and
+	 * takes an extra reference on it. */
+	if (authflavor != RPC_AUTH_UNIX) {
+		struct rpc_auth *auth;
+
+		server->client_sys = rpc_clone_client(server->client);
+		if (IS_ERR(server->client_sys))
+			return PTR_ERR(server->client_sys);
+		auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
+		if (IS_ERR(auth))
+			return PTR_ERR(auth);
+	} else {
+		atomic_inc(&server->client->cl_count);
+		server->client_sys = server->client;
+	}
+	if (server->flags & NFS_MOUNT_VER3) {
+#ifdef CONFIG_NFS_V3_ACL
+		if (!(server->flags & NFS_MOUNT_NOACL)) {
+			server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
+			/* No errors! Assume that Sun nfsacls are supported */
+			if (!IS_ERR(server->client_acl))
+				server->caps |= NFS_CAP_ACLS;
+		}
+#else
+		server->flags &= ~NFS_MOUNT_NOACL;	/* NOTE(review): clears NOACL when ACL support is compiled out - confirm intent */
+#endif /* CONFIG_NFS_V3_ACL */
+		/*
+		 * The VFS shouldn't apply the umask to mode bits. We will
+		 * do so ourselves when necessary.
+		 */
+		sb->s_flags |= MS_POSIXACL;
+		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+			server->namelen = NFS3_MAXNAMLEN;
+		sb->s_time_gran = 1;
+	} else {
+		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+			server->namelen = NFS2_MAXNAMLEN;
+	}
+
+	sb->s_op = &nfs_sops;
+	return nfs_sb_init(sb, authflavor);
+}
+
+/*
+ * "set" callback handed to sget(): attach the nfs_server passed as @data to
+ * the superblock's private pointer and let set_anon_super() finish the job.
+ */
+static int nfs_set_super(struct super_block *s, void *data)
+{
+	int ret;
+
+	s->s_fs_info = data;
+	ret = set_anon_super(s, data);
+	return ret;
+}
+
+/*
+ * "test" callback handed to sget(): returns 1 when the existing superblock
+ * @sb has the same server IP address, port and root file handle as the
+ * nfs_server passed in @data, and 0 otherwise.
+ */
+static int nfs_compare_super(struct super_block *sb, void *data)
+{
+	struct nfs_server *old = NFS_SB(sb);
+	struct nfs_server *wanted = data;
+
+	if (old->addr.sin_addr.s_addr != wanted->addr.sin_addr.s_addr ||
+	    old->addr.sin_port != wanted->addr.sin_port)
+		return 0;
+	return nfs_compare_fh(&old->fh, &wanted->fh) == 0;
+}
+
+static int nfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{	/*
+	 * Mount entry point for NFSv2/v3: validate and normalise the binary
+	 * mount data in raw_data, build an nfs_server record from it, then
+	 * find or create the matching superblock and attach it to mnt.
+	 * Returns the result of simple_set_mnt() or a negative errno.
+	 */
+	int error;
+	struct nfs_server *server = NULL;
+	struct super_block *s;
+	struct nfs_fh *root;
+	struct nfs_mount_data *data = raw_data;
+
+	error = -EINVAL;
+	if (data == NULL) {
+		dprintk("%s: missing data argument\n", __FUNCTION__);
+		goto out_err_noserver;
+	}
+	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
+		dprintk("%s: bad mount version\n", __FUNCTION__);
+		goto out_err_noserver;
+	}
+	switch (data->version) {	/* each case falls through to the next one */
+		case 1:
+			data->namlen = 0;	/* fall through */
+		case 2:
+			data->bsize  = 0;	/* fall through */
+		case 3:
+			if (data->flags & NFS_MOUNT_VER3) {
+				dprintk("%s: mount structure version %d does not support NFSv3\n",
+						__FUNCTION__,
+						data->version);
+				goto out_err_noserver;
+			}
+			data->root.size = NFS2_FHSIZE;
+			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);	/* fall through */
+		case 4:
+			if (data->flags & NFS_MOUNT_SECFLAVOUR) {
+				dprintk("%s: mount structure version %d does not support strong security\n",
+						__FUNCTION__,
+						data->version);
+				goto out_err_noserver;
+			}	/* fall through */
+		case 5:
+			memset(data->context, 0, sizeof(data->context));
+	}
+#ifndef CONFIG_NFS_V3
+	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
+	error = -EPROTONOSUPPORT;
+	if (data->flags & NFS_MOUNT_VER3) {
+		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
+		goto out_err_noserver;
+	}
+#endif /* CONFIG_NFS_V3 */
+
+	error = -ENOMEM;
+	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (!server)
+		goto out_err_noserver;
+	/* Zero out the NFS state stuff
+	 * and mark all three RPC clients as not yet created */
+	init_nfsv4_state(server);
+	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+
+	root = &server->fh;
+	if (data->flags & NFS_MOUNT_VER3)
+		root->size = data->root.size;
+	else
+		root->size = NFS2_FHSIZE;
+	error = -EINVAL;
+	if (root->size > sizeof(root->data)) {
+		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
+		goto out_err;
+	}
+	memcpy(root->data, data->root.data, root->size);
+
+	/* We now require that the mount process passes the remote address */
+	memcpy(&server->addr, &data->addr, sizeof(server->addr));
+	if (server->addr.sin_addr.s_addr == INADDR_ANY) {
+		dprintk("%s: mount program didn't pass remote address!\n",
+				__FUNCTION__);
+		goto out_err;
+	}
+
+	/* Fire up rpciod if not yet running */
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
+		goto out_err;
+	}
+
+	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto out_err_rpciod;
+	}
+
+	if (s->s_root)	/* matching superblock is already set up: reuse it */
+		goto out_rpciod_down;
+
+	s->s_flags = flags;
+
+	error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+	if (error) {
+		up_write(&s->s_umount);
+		deactivate_super(s);
+		return error;
+	}
+	s->s_flags |= MS_ACTIVE;
+	return simple_set_mnt(mnt, s);
+
+out_rpciod_down:	/* reuse path: drop our rpciod reference and server record */
+	rpciod_down();
+	kfree(server);
+	return simple_set_mnt(mnt, s);
+
+out_err_rpciod:
+	rpciod_down();
+out_err:
+	kfree(server);
+out_err_noserver:
+	return error;
+}
+
+static void nfs_kill_super(struct super_block *s)
+{	/*
+	 * Tear down an NFSv2/v3 superblock and release what its nfs_server
+	 * record holds: the RPC clients that were actually created, the
+	 * lockd and rpciod references, the I/O statistics, the hostname and
+	 * finally the record itself.
+	 */
+	struct nfs_server *server = NFS_SB(s);
+
+	kill_anon_super(s);
+
+	if (!IS_ERR(server->client))	/* ERR_PTR means "never created" */
+		rpc_shutdown_client(server->client);
+	if (!IS_ERR(server->client_sys))
+		rpc_shutdown_client(server->client_sys);
+	if (!IS_ERR(server->client_acl))
+		rpc_shutdown_client(server->client_acl);
+
+	if (!(server->flags & NFS_MOUNT_NONLM))
+		lockd_down();	/* release rpc.lockd */
+
+	rpciod_down();		/* release rpciod */
+
+	nfs_free_iostats(server->io_stats);
+	kfree(server->hostname);
+	kfree(server);
+	nfs_release_automount_timer();
+}
+
+/*
+ * fill_sb callback for NFSv2/v3 clones: record the fsid and file handle of
+ * the new root in @server, then find or create the matching superblock.
+ * lockd is started only for a superblock that has no root yet and only if
+ * locking has not been disabled with NFS_MOUNT_NONLM.
+ */
+static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb;
+
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
+
+	sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root != NULL)
+		return sb;
+
+	if (!(server->flags & NFS_MOUNT_NONLM))
+		lockd_up();
+	return sb;
+}
+
+/*
+ * get_sb-style entry point for cloning an NFSv2/v3 superblock; @raw_data
+ * is a struct nfs_clone_mount.  Only @raw_data and @mnt are used.
+ */
+static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+	struct nfs_clone_mount *clone = raw_data;
+
+	return nfs_clone_generic_sb(clone, nfs_clone_sb, nfs_clone_server, mnt);
+}
+
+#ifdef CONFIG_NFS_V4
+/*
+ * Create the RPC client used by an NFSv4 superblock.
+ *
+ * Looks up the nfs4_client for the server's address.  If that client has
+ * no RPC client yet, a transport and RPC client are created for it (bound
+ * to a reserved port) and the idmapper is set up.  The server is added to
+ * the client's list of superblocks and receives its own clone of the
+ * shared RPC client, switched to @flavor if the clone uses another one.
+ *
+ * Returns the cloned RPC client or an ERR_PTR().
+ */
+static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
+	struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
+{
+	struct nfs4_client *clp;
+	struct rpc_xprt *xprt = NULL;
+	struct rpc_clnt *clnt = NULL;
+	int err = -EIO;
+
+	clp = nfs4_get_client(&server->addr.sin_addr);
+	if (!clp) {
+		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
+		return ERR_PTR(err);
+	}
+
+	/* Now create transport and client */
+	down_write(&clp->cl_sem);
+	if (IS_ERR(clp->cl_rpcclient)) {
+		xprt = xprt_create_proto(proto, &server->addr, timeparms);
+		if (IS_ERR(xprt)) {
+			up_write(&clp->cl_sem);
+			err = PTR_ERR(xprt);
+			dprintk("%s: cannot create RPC transport. Error = %d\n",
+					__FUNCTION__, err);
+			goto out_fail;
+		}
+		/* Bind to a reserved port! */
+		xprt->resvport = 1;
+		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
+				server->rpc_ops->version, flavor);
+		if (IS_ERR(clnt)) {
+			up_write(&clp->cl_sem);
+			err = PTR_ERR(clnt);
+			dprintk("%s: cannot create RPC client. Error = %d\n",
+					__FUNCTION__, err);
+			goto out_fail;
+		}
+		clnt->cl_intr     = 1;
+		clnt->cl_softrtry = 1;
+		clp->cl_rpcclient = clnt;
+		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
+		nfs_idmap_new(clp);
+	}
+	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+	clnt = rpc_clone_client(clp->cl_rpcclient);
+	if (!IS_ERR(clnt))
+		server->nfs4_state = clp;
+	up_write(&clp->cl_sem);
+	clp = NULL;
+
+	if (IS_ERR(clnt)) {
+		/* Log the clone's real error rather than the initial -EIO */
+		err = PTR_ERR(clnt);
+		dprintk("%s: cannot create RPC client. Error = %d\n",
+				__FUNCTION__, err);
+		return clnt;
+	}
+
+	if (server->nfs4_state->cl_idmap == NULL) {
+		dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (clnt->cl_auth->au_flavor != flavor) {
+		struct rpc_auth *auth;
+
+		auth = rpcauth_create(flavor, clnt);
+		if (IS_ERR(auth)) {
+			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
+			return (struct rpc_clnt *)auth;
+		}
+	}
+	return clnt;
+
+ out_fail:
+	if (clp)
+		nfs4_put_client(clp);
+	return ERR_PTR(err);
+}
+
+/*
+ * Set up an NFS4 superblock
+ *
+ * Copies the transfer sizes, flags and attribute-cache timeouts from the
+ * mount data into the superblock's nfs_server, picks the RPC auth flavour
+ * (AUTH_UNIX unless exactly one flavour is supplied by the mount program),
+ * creates the RPC client and finishes with nfs_sb_init().
+ *
+ * Returns 0 on success or a negative errno.  @silent is not used.
+ */
+static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
+{
+	struct nfs_server *server;
+	struct rpc_timeout timeparms;
+	rpc_authflavor_t authflavour = RPC_AUTH_UNIX;
+
+	sb->s_blocksize_bits = 0;
+	sb->s_blocksize = 0;
+	server = NFS_SB(sb);
+	if (data->rsize != 0)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize != 0)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+	server->caps = NFS_CAP_ATOMIC_OPEN;
+
+	server->acregmin = data->acregmin*HZ;
+	server->acregmax = data->acregmax*HZ;
+	server->acdirmin = data->acdirmin*HZ;
+	server->acdirmax = data->acdirmax*HZ;
+
+	server->rpc_ops = &nfs_v4_clientops;
+
+	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
+	server->retrans_timeo = timeparms.to_initval;
+	server->retrans_count = timeparms.to_retries;
+
+	/* At most one auth flavour may be passed in from user space */
+	if (data->auth_flavourlen != 0) {
+		if (data->auth_flavourlen != 1) {
+			dprintk("%s: Invalid number of RPC auth flavours %d.\n",
+					__FUNCTION__, data->auth_flavourlen);
+			return -EINVAL;
+		}
+		if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour)))
+			return -EFAULT;
+	}
+
+	/* Now create transport and client */
+	server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour);
+	if (IS_ERR(server->client)) {
+		int err = PTR_ERR(server->client);
+
+		dprintk("%s: cannot create RPC client. Error = %d\n",
+				__FUNCTION__, err);
+		return err;
+	}
+
+	sb->s_time_gran = 1;
+	sb->s_op = &nfs4_sops;
+	return nfs_sb_init(sb, authflavour);
+}
+
+/*
+ * "test" callback handed to sget() for NFSv4: returns 1 when the existing
+ * superblock @sb has the same hostname and mount path as the nfs_server
+ * passed in @data, and 0 otherwise.
+ */
+static int nfs4_compare_super(struct super_block *sb, void *data)
+{
+	struct nfs_server *wanted = data;
+	struct nfs_server *old = NFS_SB(sb);
+
+	return strcmp(wanted->hostname, old->hostname) == 0 &&
+	       strcmp(wanted->mnt_path, old->mnt_path) == 0;
+}
+
+/*
+ * Copy a user-space string described by @src into kernel memory and
+ * NUL-terminate it.  At most @maxlen bytes are copied.
+ *
+ * If @dst is NULL a buffer is allocated and the caller owns it; otherwise
+ * the string is copied into @dst, which must have room for the copied
+ * bytes plus the terminator.
+ *
+ * Returns the destination buffer, or ERR_PTR(-EINVAL) for an empty string,
+ * ERR_PTR(-ENOMEM) if allocation fails, ERR_PTR(-EFAULT) if the copy fails.
+ */
+static void *
+nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
+{
+	char *allocated = NULL;
+
+	if (src->len == 0)
+		return ERR_PTR(-EINVAL);
+	if (src->len < maxlen)
+		maxlen = src->len;
+
+	if (!dst) {
+		allocated = kmalloc(maxlen + 1, GFP_KERNEL);
+		if (!allocated)
+			return ERR_PTR(-ENOMEM);
+		dst = allocated;
+	}
+
+	if (copy_from_user(dst, src->data, maxlen) != 0) {
+		/* Only frees a buffer we allocated ourselves */
+		kfree(allocated);
+		return ERR_PTR(-EFAULT);
+	}
+	dst[maxlen] = '\0';
+	return dst;
+}
+
+/*
+ * Mount entry point for NFSv4.
+ *
+ * Validates the mount data in @raw_data, builds an nfs_server record from
+ * it (hostname, mount path, client address and server address are copied
+ * in from user space), then finds or creates the matching superblock and
+ * attaches it to @mnt.
+ *
+ * Returns the result of simple_set_mnt() or a negative errno.
+ */
+static int nfs4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+	int error;
+	struct nfs_server *server;
+	struct super_block *s;
+	struct nfs4_mount_data *data = raw_data;
+	void *p;
+
+	if (data == NULL) {
+		dprintk("%s: missing data argument\n", __FUNCTION__);
+		return -EINVAL;
+	}
+	if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
+		dprintk("%s: bad mount version\n", __FUNCTION__);
+		return -EINVAL;
+	}
+
+	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (!server)
+		return -ENOMEM;
+	/* Zero out the NFS state stuff */
+	init_nfsv4_state(server);
+	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+
+	p = nfs_copy_user_string(NULL, &data->hostname, 256);
+	if (IS_ERR(p))
+		goto out_err;
+	server->hostname = p;
+
+	p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
+	if (IS_ERR(p))
+		goto out_err;
+	server->mnt_path = p;
+
+	p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
+			sizeof(server->ip_addr) - 1);
+	if (IS_ERR(p))
+		goto out_err;
+
+	/* We now require that the mount process passes the remote address */
+	if (data->host_addrlen != sizeof(server->addr)) {
+		error = -EINVAL;
+		goto out_free;
+	}
+	if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
+		error = -EFAULT;
+		goto out_free;
+	}
+	if (server->addr.sin_family != AF_INET ||
+	    server->addr.sin_addr.s_addr == INADDR_ANY) {
+		dprintk("%s: mount program didn't pass remote IP address!\n",
+				__FUNCTION__);
+		error = -EINVAL;
+		goto out_free;
+	}
+
+	/* Fire up rpciod if not yet running */
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
+		goto out_free;
+	}
+
+	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
+
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		/* rpciod_up() succeeded, so its reference must be dropped */
+		goto out_rpciod_down;
+	}
+
+	if (s->s_root) {
+		/*
+		 * Reusing an existing superblock: our server record will
+		 * never reach nfs4_kill_super(), so drop the rpciod
+		 * reference taken above here, as nfs_get_sb() does.
+		 */
+		rpciod_down();
+		kfree(server->mnt_path);
+		kfree(server->hostname);
+		kfree(server);
+		return simple_set_mnt(mnt, s);
+	}
+
+	s->s_flags = flags;
+
+	error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+	if (error) {
+		up_write(&s->s_umount);
+		deactivate_super(s);
+		return error;
+	}
+	s->s_flags |= MS_ACTIVE;
+	return simple_set_mnt(mnt, s);
+out_rpciod_down:
+	rpciod_down();
+	goto out_free;
+out_err:
+	error = PTR_ERR(p);
+out_free:
+	kfree(server->mnt_path);
+	kfree(server->hostname);
+	kfree(server);
+	return error;
+}
+
+/*
+ * Tear down an NFSv4 superblock: return all delegations, kill the
+ * superblock, prepare the lease renewal daemon for shutdown, shut down the
+ * RPC client if one was created, destroy the NFSv4 state, and release the
+ * rpciod reference, the I/O statistics, the hostname and the server record.
+ */
+static void nfs4_kill_super(struct super_block *sb)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct rpc_clnt *clnt;
+
+	nfs_return_all_delegations(sb);
+	kill_anon_super(sb);
+
+	nfs4_renewd_prepare_shutdown(server);
+
+	/* The client may be NULL or an ERR_PTR() if setup never got that far */
+	clnt = server->client;
+	if (clnt && !IS_ERR(clnt))
+		rpc_shutdown_client(clnt);
+
+	destroy_nfsv4_state(server);
+
+	rpciod_down();
+
+	nfs_free_iostats(server->io_stats);
+	kfree(server->hostname);
+	kfree(server);
+	nfs_release_automount_timer();
+}
+
+/*
+ * Constructs the SERVER-side path
+ *
+ * The path for @dentry is built by nfs4_path() in a scratch page and then
+ * copied into a kmalloc()ed buffer that the caller must kfree().
+ *
+ * Returns the new string, ERR_PTR(-ENOMEM) if either allocation fails, or
+ * the ERR_PTR() returned by nfs4_path().
+ */
+static inline char *nfs4_dup_path(const struct dentry *dentry)
+{
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *path;
+
+	/* Never hand a NULL scratch buffer to nfs4_path() */
+	if (page == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (!IS_ERR(path)) {
+		/* Bytes from the returned start of the path to the page end */
+		int len = PAGE_SIZE + page - path;
+		char *tmp = path;
+
+		path = kmalloc(len, GFP_KERNEL);
+		if (path)
+			memcpy(path, tmp, len);
+		else
+			path = ERR_PTR(-ENOMEM);
+	}
+	free_page((unsigned long)page);
+	return path;
+}
+
+/*
+ * fill_sb callback for NFSv4 clones: record the fsid, file handle and
+ * server-side path of the new root in @server, then find or create the
+ * matching superblock.  For a superblock that has no root yet, the server
+ * capabilities are probed and @server is linked onto its nfs4_client,
+ * taking a reference on that client.
+ *
+ * On failure, or when an existing superblock is returned, the duplicated
+ * path is released and server->mnt_path is left NULL.
+ */
+static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct nfs4_client *clp = server->nfs4_state;
+	struct super_block *sb;
+
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
+
+	server->mnt_path = nfs4_dup_path(data->dentry);
+	if (IS_ERR(server->mnt_path)) {
+		sb = (struct super_block *)server->mnt_path;
+		server->mnt_path = NULL;
+		return sb;
+	}
+
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root) {
+		kfree(server->mnt_path);
+		server->mnt_path = NULL;
+		return sb;
+	}
+
+	nfs4_server_capabilities(server, &server->fh);
+
+	down_write(&clp->cl_sem);
+	atomic_inc(&clp->cl_count);
+	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+	up_write(&clp->cl_sem);
+	return sb;
+}
+
+/*
+ * get_sb-style entry point for cloning an NFSv4 superblock; @raw_data is a
+ * struct nfs_clone_mount.  Only @raw_data and @mnt are used.
+ */
+static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+	struct nfs_clone_mount *clone = raw_data;
+
+	return nfs_clone_generic_sb(clone, nfs4_clone_sb, nfs_clone_server, mnt);
+}
+
+/*
+ * fill_sb callback for NFSv4 referrals: copy the referral's mount path and
+ * server address into @server, then find or create the matching
+ * superblock.
+ *
+ * On failure, or when an existing superblock is returned, the copied path
+ * is released and server->mnt_path is left NULL.
+ */
+static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb;
+	int pathlen = strlen(data->mnt_path) + 1;
+
+	server->mnt_path = kmalloc(pathlen, GFP_KERNEL);
+	if (!server->mnt_path)
+		return ERR_PTR(-ENOMEM);
+	memcpy(server->mnt_path, data->mnt_path, pathlen);
+	memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
+
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (!IS_ERR(sb) && !sb->s_root)
+		return sb;
+
+	kfree(server->mnt_path);
+	server->mnt_path = NULL;
+	return sb;
+}
+
+/*
+ * fill_server callback for NFSv4 referrals: create an RPC client over TCP
+ * for the referred-to server and initialise the superblock with it.
+ *
+ * Returns the superblock's server record, or an ERR_PTR() on failure.
+ */
+static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct rpc_timeout timeparms;
+	void *err;
+
+	/* Since we are following a referral and there may be alternatives,
+	   set the timeouts and retries to low values */
+	nfs_init_timeout_values(&timeparms, IPPROTO_TCP, 2, 1);
+
+	server->client = nfs4_create_client(server, &timeparms, IPPROTO_TCP, data->authflavor);
+	err = server->client;
+	if (IS_ERR(err))
+		return (struct nfs_server *)err;
+
+	sb->s_time_gran = 1;
+	sb->s_op = &nfs4_sops;
+
+	err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
+	if (IS_ERR(err))
+		return (struct nfs_server *)err;
+	return server;
+}
+
+/*
+ * get_sb-style entry point for mounting an NFSv4 referral; @raw_data is a
+ * struct nfs_clone_mount.  Only @raw_data and @mnt are used.
+ */
+static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+	struct nfs_clone_mount *referral = raw_data;
+
+	return nfs_clone_generic_sb(referral, nfs4_referral_sb, nfs4_referral_server, mnt);
+}
+
+#endif
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 18dc95b0b646..600bbe630abd 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -52,7 +52,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct page *page;
-	void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode));
+	void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 	if (err)
 		goto read_failed;
 	page = read_cache_page(&inode->i_data, 0,
@@ -75,22 +75,13 @@ read_failed:
 	return NULL;
 }
 
-static void nfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
-{
-	if (cookie) {
-		struct page *page = cookie;
-		kunmap(page);
-		page_cache_release(page);
-	}
-}
-
 /*
  * symlinks can't do much...
  */
 struct inode_operations nfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= nfs_follow_link,
-	.put_link	= nfs_put_link,
+	.put_link	= page_put_link,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
 };
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 4c486eb867ca..db61e51bb154 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
 
 #include "callback.h"
 
@@ -46,6 +47,15 @@ static ctl_table nfs_cb_sysctls[] = {
 		.strategy = &sysctl_jiffies,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nfs_mountpoint_timeout",
+		.data		= &nfs_mountpoint_expiry_timeout,
+		.maxlen		= sizeof(nfs_mountpoint_expiry_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4cfada2cc09f..b383fdd3a15c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -98,11 +98,10 @@ struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kzalloc(size, GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_commit_mempool);
 				p = NULL;
@@ -126,14 +125,11 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!p->pagevec) {
 				mempool_free(p, nfs_wdata_mempool);
 				p = NULL;
 			}
@@ -583,6 +579,17 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un
 	return ret;
 }
 
+/* Unlink each request on @head from its list and inode, clearing page writeback. */
+static void nfs_cancel_requests(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct nfs_page *entry = nfs_list_entry(head->next);
+		nfs_list_remove_request(entry);
+		nfs_inode_remove_request(entry);
+		nfs_clear_page_writeback(entry);
+	}
+}
+
 /*
  * nfs_scan_dirty - Scan an inode for dirty requests
  * @inode: NFS inode to scan
@@ -627,7 +634,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
 	int res = 0;
 
 	if (nfsi->ncommit != 0) {
-		res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages);
+		res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
 		nfsi->ncommit -= res;
 		if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -1495,15 +1502,25 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
 		pages = nfs_scan_dirty(inode, &head, idx_start, npages);
 		if (pages != 0) {
 			spin_unlock(&nfsi->req_lock);
-			ret = nfs_flush_list(inode, &head, pages, how);
+			if (how & FLUSH_INVALIDATE)
+				nfs_cancel_requests(&head);
+			else
+				ret = nfs_flush_list(inode, &head, pages, how);
 			spin_lock(&nfsi->req_lock);
 			continue;
 		}
 		if (nocommit)
 			break;
-		pages = nfs_scan_commit(inode, &head, 0, 0);
+		pages = nfs_scan_commit(inode, &head, idx_start, npages);
 		if (pages == 0)
 			break;
+		if (how & FLUSH_INVALIDATE) {
+			spin_unlock(&nfsi->req_lock);
+			nfs_cancel_requests(&head);
+			spin_lock(&nfsi->req_lock);
+			continue;
+		}
+		pages += nfs_scan_commit(inode, &head, 0, 0);
 		spin_unlock(&nfsi->req_lock);
 		ret = nfs_commit_list(inode, &head, how);
 		spin_lock(&nfsi->req_lock);
@@ -1512,7 +1529,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
 	return ret;
 }
 
-int nfs_init_writepagecache(void)
+int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
 					     sizeof(struct nfs_write_data),
@@ -1534,7 +1551,7 @@ int nfs_init_writepagecache(void)
 	return 0;
 }
 
-void nfs_destroy_writepagecache(void)
+void __exit nfs_destroy_writepagecache(void)
 {
 	mempool_destroy(nfs_commit_mempool);
 	mempool_destroy(nfs_wdata_mempool);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index de3998f15f10..5446a0861d1d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1310,7 +1310,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		status = vfs_statfs(dentry->d_inode->i_sb, &statfs);
+		status = vfs_statfs(dentry, &statfs);
 		if (status)
 			goto out_nfserr;
 	}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3ef017b3b5bd..a1810e6a93e5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -494,10 +494,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
 
-static struct super_block *nfsd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int nfsd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, nfsd_fill_super);
+	return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
 }
 
 static struct file_system_type nfsd_fs_type = {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1d65f13f458c..245eaa1fb59b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1737,7 +1737,7 @@ int
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 {
 	int err = fh_verify(rqstp, fhp, 0, MAY_NOP);
-	if (!err && vfs_statfs(fhp->fh_dentry->d_inode->i_sb,stat))
+	if (!err && vfs_statfs(fhp->fh_dentry,stat))
 		err = nfserr_io;
 	return err;
 }
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 3b74e66ca2ff..325ce261a107 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -86,8 +86,7 @@ static inline void ntfs_unmap_page(struct page *page)
 static inline struct page *ntfs_map_page(struct address_space *mapping,
 		unsigned long index)
 {
-	struct page *page = read_cache_page(mapping, index,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, index, NULL);
 
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1663f5c3c6aa..6708e1d68a9e 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2529,8 +2529,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	end >>= PAGE_CACHE_SHIFT;
 	/* If there is a first partial page, need to do it the slow way. */
 	if (start_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read first partial "
 					"page (sync error, index 0x%lx).", idx);
@@ -2600,8 +2599,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	}
 	/* If there is a last partial page, need to do it the slow way. */
 	if (end_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read last partial page "
 					"(sync error, index 0x%lx).", idx);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index c63a83e8da98..2e42c2dcae12 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -231,8 +231,7 @@ do_non_resident_extend:
 		 * Read the page.  If the page is not present, this will zero
 		 * the uninitialized regions for us.
 		 */
-		page = read_cache_page(mapping, index,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
 			goto init_err_out;
@@ -1359,7 +1358,7 @@ err_out:
 	goto out;
 }
 
-static size_t __ntfs_copy_from_user_iovec(char *vaddr,
+static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
 		const struct iovec *iov, size_t iov_ofs, size_t bytes)
 {
 	size_t total = 0;
@@ -1377,10 +1376,6 @@ static size_t __ntfs_copy_from_user_iovec(char *vaddr,
 		bytes -= len;
 		vaddr += len;
 		if (unlikely(left)) {
-			/*
-			 * Zero the rest of the target like __copy_from_user().
-			 */
-			memset(vaddr, 0, bytes);
 			total -= left;
 			break;
 		}
@@ -1421,11 +1416,13 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
  * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
  * single-segment behaviour.
  *
- * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
- * when not atomic.  This is ok because __ntfs_copy_from_user_iovec() calls
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
- * fact, the only difference between __copy_from_user_inatomic() and
- * __copy_from_user() is that the latter calls might_sleep().  And on many
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
+ * when atomic and when not atomic.  This is ok because
+ * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
+ * and it is ok to call this when non-atomic.
+ * In fact, the only difference between __copy_from_user_inatomic() and
+ * __copy_from_user() is that the latter calls might_sleep() and the former
+ * should not zero the tail of the buffer on error.  And on many
  * architectures __copy_from_user_inatomic() is just defined to
  * __copy_from_user() so it makes no difference at all on those architectures.
  */
@@ -1442,14 +1439,18 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
 		if (len > bytes)
 			len = bytes;
 		kaddr = kmap_atomic(*pages, KM_USER0);
-		copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+		copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
 				*iov, *iov_ofs, len);
 		kunmap_atomic(kaddr, KM_USER0);
 		if (unlikely(copied != len)) {
 			/* Do it the slow way. */
 			kaddr = kmap(*pages);
-			copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+			copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
 					*iov, *iov_ofs, len);
+			/*
+			 * Zero the rest of the target like __copy_from_user().
+			 */
+			memset(kaddr + ofs + copied, 0, len - copied);
 			kunmap(*pages);
 			if (unlikely(copied != len))
 				goto err_out;
@@ -1484,14 +1485,15 @@ static inline void ntfs_flush_dcache_pages(struct page **pages,
 		unsigned nr_pages)
 {
 	BUG_ON(!nr_pages);
+	/*
+	 * Warning: Do not do the decrement at the same time as the call to
+	 * flush_dcache_page() because it is a NULL macro on i386 and hence the
+	 * decrement never happens so the loop never terminates.
+	 */
 	do {
-		/*
-		 * Warning: Do not do the decrement at the same time as the
-		 * call because flush_dcache_page() is a NULL macro on i386
-		 * and hence the decrement never happens.
-		 */
+		--nr_pages;
 		flush_dcache_page(pages[nr_pages]);
-	} while (--nr_pages > 0);
+	} while (nr_pages > 0);
 }
 
 /**
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 27833f6df49f..0e14acea3f8b 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2601,10 +2601,10 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 
 /**
  * ntfs_statfs - return information about mounted NTFS volume
- * @sb:		super block of mounted volume
+ * @dentry:	dentry from mounted volume
  * @sfs:	statfs structure in which to return the information
  *
- * Return information about the mounted NTFS volume @sb in the statfs structure
+ * Return information about the mounted NTFS volume containing @dentry in the statfs structure
  * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
  * called). We interpret the values to be correct of the moment in time at
  * which we are called. Most values are variable otherwise and this isn't just
@@ -2617,8 +2617,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
  *
  * Return 0 on success or -errno on error.
  */
-static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 {
+	struct super_block *sb = dentry->d_sb;
 	s64 size;
 	ntfs_volume *vol = NTFS_SB(sb);
 	ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
@@ -3093,10 +3094,11 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
 
-static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ntfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type ntfs_fs_type = {
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7e88e24b3471..7273d9fa6bab 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -574,10 +574,10 @@ static struct inode_operations dlmfs_file_inode_operations = {
 	.getattr	= simple_getattr,
 };
 
-static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
 }
 
 static struct file_system_type dlmfs_fs_type = {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 949b3dac30f1..cdf73393f094 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -100,7 +100,7 @@ static int ocfs2_initialize_mem_caches(void);
 static void ocfs2_free_mem_caches(void);
 static void ocfs2_delete_osb(struct ocfs2_super *osb);
 
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int ocfs2_sync_fs(struct super_block *sb, int wait);
 
@@ -672,12 +672,14 @@ read_super_error:
 	return status;
 }
 
-static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
-					int flags,
-					const char *dev_name,
-					void *data)
+static int ocfs2_get_sb(struct file_system_type *fs_type,
+			int flags,
+			const char *dev_name,
+			void *data,
+			struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
+			   mnt);
 }
 
 static struct file_system_type ocfs2_fs_type = {
@@ -855,7 +857,7 @@ static void ocfs2_put_super(struct super_block *sb)
 	mlog_exit_void();
 }
 
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct ocfs2_super *osb;
 	u32 numbits, freebits;
@@ -864,9 +866,9 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
 	struct buffer_head *bh = NULL;
 	struct inode *inode = NULL;
 
-	mlog_entry("(%p, %p)\n", sb, buf);
+	mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
 
-	osb = OCFS2_SB(sb);
+	osb = OCFS2_SB(dentry->d_sb);
 
 	inode = ocfs2_get_system_file_inode(osb,
 					    GLOBAL_BITMAP_SYSTEM_INODE,
@@ -889,7 +891,7 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
 	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
 
 	buf->f_type = OCFS2_SUPER_MAGIC;
-	buf->f_bsize = sb->s_blocksize;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
 	buf->f_blocks = ((sector_t) numbits) *
 			(osb->s_clustersize >> osb->sb->s_blocksize_bits);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f6986bd79e75..0c8a1294ec96 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -64,8 +64,7 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
 {
 	struct page * page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_cache_page(mapping, 0,
-			       (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
 	wait_on_page_locked(page);
diff --git a/fs/open.c b/fs/open.c
index 317b7c7f38a7..303f06d2a7b9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -31,18 +31,18 @@
 
 #include <asm/unistd.h>
 
-int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval = -ENODEV;
 
-	if (sb) {
+	if (dentry) {
 		retval = -ENOSYS;
-		if (sb->s_op->statfs) {
+		if (dentry->d_sb->s_op->statfs) {
 			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(sb);
+			retval = security_sb_statfs(dentry);
 			if (retval)
 				return retval;
-			retval = sb->s_op->statfs(sb, buf);
+			retval = dentry->d_sb->s_op->statfs(dentry, buf);
 			if (retval == 0 && buf->f_frsize == 0)
 				buf->f_frsize = buf->f_bsize;
 		}
@@ -52,12 +52,12 @@ int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
 
 EXPORT_SYMBOL(vfs_statfs);
 
-static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -95,12 +95,12 @@ static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
 	return 0;
 }
 
-static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf)
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -130,7 +130,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf)
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct statfs tmp;
-		error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs_native(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -149,7 +149,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct statfs64 tmp;
-		error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs64(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -168,7 +168,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs_native(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -189,7 +189,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs64(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -322,7 +322,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = do_truncate(dentry, length, 0, file);
+		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
 	fput(file);
 out:
@@ -633,7 +633,7 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	dentry = file->f_dentry;
 	inode = dentry->d_inode;
 
-	audit_inode(NULL, inode, 0);
+	audit_inode(NULL, inode);
 
 	err = -EROFS;
 	if (IS_RDONLY(inode))
@@ -786,7 +786,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
 	if (file) {
 		struct dentry * dentry;
 		dentry = file->f_dentry;
-		audit_inode(NULL, dentry->d_inode, 0);
+		audit_inode(NULL, dentry->d_inode);
 		error = chown_common(dentry, user, group);
 		fput(file);
 	}
@@ -1152,7 +1152,7 @@ int filp_close(struct file *filp, fl_owner_t id)
 	}
 
 	if (filp->f_op && filp->f_op->flush)
-		retval = filp->f_op->flush(filp);
+		retval = filp->f_op->flush(filp, id);
 
 	dnotify_flush(filp, id);
 	locks_remove_posix(filp, id);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 0f14276a2e51..efc7c91128af 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -64,6 +64,11 @@ static int openpromfs_readdir(struct file *, void *, filldir_t);
 static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd);
 static int openpromfs_unlink (struct inode *, struct dentry *dentry);
 
+static inline u16 ptr_nod(void *p)
+{
+    return (long)p & 0xFFFF;
+}
+
 static ssize_t nodenum_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
@@ -72,7 +77,7 @@ static ssize_t nodenum_read(struct file *file, char __user *buf,
 	
 	if (count < 0 || !inode->u.generic_ip)
 		return -EINVAL;
-	sprintf (buffer, "%8.8x\n", (u32)(long)(inode->u.generic_ip));
+	sprintf (buffer, "%8.8lx\n", (long)inode->u.generic_ip);
 	if (file->f_pos >= 9)
 		return 0;
 	if (count > 9 - file->f_pos)
@@ -95,9 +100,9 @@ static ssize_t property_read(struct file *filp, char __user *buf,
 	char buffer[64];
 	
 	if (!filp->private_data) {
-		node = nodes[(u16)((long)inode->u.generic_ip)].node;
+		node = nodes[ptr_nod(inode->u.generic_ip)].node;
 		i = ((u32)(long)inode->u.generic_ip) >> 16;
-		if ((u16)((long)inode->u.generic_ip) == aliases) {
+		if (ptr_nod(inode->u.generic_ip) == aliases) {
 			if (i >= aliases_nodes)
 				p = NULL;
 			else
@@ -111,7 +116,7 @@ static ssize_t property_read(struct file *filp, char __user *buf,
 			return -EIO;
 		i = prom_getproplen (node, p);
 		if (i < 0) {
-			if ((u16)((long)inode->u.generic_ip) == aliases)
+			if (ptr_nod(inode->u.generic_ip) == aliases)
 				i = 0;
 			else
 				return -EIO;
@@ -123,7 +128,7 @@ static ssize_t property_read(struct file *filp, char __user *buf,
 					      GFP_KERNEL);
 		if (!filp->private_data)
 			return -ENOMEM;
-		op = (openprom_property *)filp->private_data;
+		op = filp->private_data;
 		op->flag = 0;
 		op->alloclen = 2 * i;
 		strcpy (op->name, p);
@@ -163,7 +168,7 @@ static ssize_t property_read(struct file *filp, char __user *buf,
 				op->len--;
 		}
 	} else
-		op = (openprom_property *)filp->private_data;
+		op = filp->private_data;
 	if (!count || !(op->len || (op->flag & OPP_ASCIIZ)))
 		return 0;
 	if (*ppos >= 0xffffff || count >= 0xffffff)
@@ -335,7 +340,7 @@ static ssize_t property_write(struct file *filp, const char __user *buf,
 			return i;
 	}
 	k = *ppos;
-	op = (openprom_property *)filp->private_data;
+	op = filp->private_data;
 	if (!(op->flag & OPP_STRING)) {
 		u32 *first, *last;
 		int first_off, last_cnt;
@@ -388,13 +393,13 @@ static ssize_t property_write(struct file *filp, const char __user *buf,
 			memcpy (b, filp->private_data,
 				sizeof (openprom_property)
 				+ strlen (op->name) + op->alloclen);
-			memset (((char *)b) + sizeof (openprom_property)
+			memset (b + sizeof (openprom_property)
 				+ strlen (op->name) + op->alloclen, 
 				0, 2 * i - op->alloclen);
-			op = (openprom_property *)b;
+			op = b;
 			op->alloclen = 2*i;
 			b = filp->private_data;
-			filp->private_data = (void *)op;
+			filp->private_data = op;
 			kfree (b);
 		}
 		first = ((u32 *)op->value) + (k / 9);
@@ -448,10 +453,11 @@ static ssize_t property_write(struct file *filp, const char __user *buf,
 					*q |= simple_strtoul (tmp, NULL, 16);
 					buf += last_cnt;
 				} else {
-					char tchars[17]; /* XXX yuck... */
+					char tchars[2 * sizeof(long) + 1];
 
-					if (copy_from_user(tchars, buf, 16))
+					if (copy_from_user(tchars, buf, sizeof(tchars) - 1))
 						return -EFAULT;
+                                        tchars[sizeof(tchars) - 1] = '\0';
 					*q = simple_strtoul (tchars, NULL, 16);
 					buf += 9;
 				}
@@ -497,13 +503,13 @@ write_try_string:
 			memcpy (b, filp->private_data,
 				sizeof (openprom_property)
 				+ strlen (op->name) + op->alloclen);
-			memset (((char *)b) + sizeof (openprom_property)
+			memset (b + sizeof (openprom_property)
 				+ strlen (op->name) + op->alloclen, 
 				0, 2*(count - *ppos) - op->alloclen);
-			op = (openprom_property *)b;
+			op = b;
 			op->alloclen = 2*(count + *ppos);
 			b = filp->private_data;
-			filp->private_data = (void *)op;
+			filp->private_data = op;
 			kfree (b);
 		}
 		p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0);
@@ -532,15 +538,15 @@ write_try_string:
 
 int property_release (struct inode *inode, struct file *filp)
 {
-	openprom_property *op = (openprom_property *)filp->private_data;
+	openprom_property *op = filp->private_data;
 	int error;
 	u32 node;
 	
 	if (!op)
 		return 0;
 	lock_kernel();
-	node = nodes[(u16)((long)inode->u.generic_ip)].node;
-	if ((u16)((long)inode->u.generic_ip) == aliases) {
+	node = nodes[ptr_nod(inode->u.generic_ip)].node;
+	if (ptr_nod(inode->u.generic_ip) == aliases) {
 		if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) {
 			char *p = op->name;
 			int i = (op->value - op->name) - strlen (op->name) - 1;
@@ -931,7 +937,7 @@ static int __init check_space (u16 n)
 			return -1;
 
 		if (nodes) {
-			memcpy ((char *)pages, (char *)nodes,
+			memcpy ((char *)pages, nodes,
 				(1 << alloced) * PAGE_SIZE);
 			free_pages ((unsigned long)nodes, alloced);
 		}
@@ -1054,10 +1060,10 @@ out_no_root:
 	return -ENOMEM;
 }
 
-static struct super_block *openprom_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int openprom_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, openprom_fill_super);
+	return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
 }
 
 static struct file_system_type openprom_fs_type = {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7ef1f094de91..2ef313a96b66 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -329,6 +329,7 @@ void delete_partition(struct gendisk *disk, int part)
 	p->ios[0] = p->ios[1] = 0;
 	p->sectors[0] = p->sectors[1] = 0;
 	devfs_remove("%s/part%d", disk->devfs_name, part);
+	sysfs_remove_link(&p->kobj, "subsystem");
 	if (p->holder_dir)
 		kobject_unregister(p->holder_dir);
 	kobject_uevent(&p->kobj, KOBJ_REMOVE);
@@ -363,6 +364,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
 	kobject_add(&p->kobj);
 	if (!disk->part_uevent_suppress)
 		kobject_uevent(&p->kobj, KOBJ_ADD);
+	sysfs_create_link(&p->kobj, &block_subsys.kset.kobj, "subsystem");
 	partition_sysfs_add_subdir(p);
 	disk->part[part-1] = p;
 }
@@ -398,6 +400,7 @@ static void disk_sysfs_symlinks(struct gendisk *disk)
 			kfree(disk_name);
 		}
 	}
+	sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, "subsystem");
 }
 
 /* Not exported, helper to add_disk(). */
@@ -481,6 +484,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		sector_t from = state->parts[p].from;
 		if (!size)
 			continue;
+		if (from + size > get_capacity(disk)) {
+			printk(" %s: p%d exceeds device capacity\n",
+				disk->disk_name, p);
+		}
 		add_partition(disk, p, from, size);
 #ifdef CONFIG_BLK_DEV_MD
 		if (state->parts[p].flags)
@@ -496,8 +503,8 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 	struct page *page;
 
-	page = read_cache_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-			(filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+				 NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		if (!PageUptodate(page))
@@ -548,5 +555,6 @@ void del_gendisk(struct gendisk *disk)
 		put_device(disk->driverfs_dev);
 		disk->driverfs_dev = NULL;
 	}
+	sysfs_remove_link(&disk->kobj, "subsystem");
 	kobject_del(&disk->kobj);
 }
diff --git a/fs/pipe.c b/fs/pipe.c
index 5acd8954aaa0..20352573e025 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -979,12 +979,11 @@ no_files:
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */
-
-static struct super_block *
-pipefs_get_sb(struct file_system_type *fs_type, int flags,
-	      const char *dev_name, void *data)
+static int pipefs_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data,
+			 struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
+	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
 }
 
 static struct file_system_type pipe_fs_type = {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6cc77dc3f3ff..6afff725a8c9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1019,8 +1019,8 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	if (current != task)
 		return -EPERM;
 
-	if (count > PAGE_SIZE)
-		count = PAGE_SIZE;
+	if (count >= PAGE_SIZE)
+		count = PAGE_SIZE - 1;
 
 	if (*ppos != 0) {
 		/* No partial writes. */
@@ -1033,6 +1033,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	if (copy_from_user(page, buf, count))
 		goto out_free_page;
 
+	page[count] = '\0';
 	loginuid = simple_strtoul(page, &tmp, 10);
 	if (tmp == page) {
 		length = -EINVAL;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c3fd3611112f..9995356ce73e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -26,10 +26,10 @@ struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc
 struct proc_dir_entry *proc_sys_root;
 #endif
 
-static struct super_block *proc_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int proc_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, proc_fill_super);
+	return get_sb_single(fs_type, flags, data, proc_fill_super, mnt);
 }
 
 static struct file_system_type proc_fs_type = {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2ecd46f85e9f..2f24c46f72a1 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -128,7 +128,7 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_destroy_inode(struct inode *inode);
 static void qnx4_read_inode(struct inode *);
 static int qnx4_remount(struct super_block *sb, int *flags, char *data);
-static int qnx4_statfs(struct super_block *, struct kstatfs *);
+static int qnx4_statfs(struct dentry *, struct kstatfs *);
 
 static struct super_operations qnx4_sops =
 {
@@ -282,8 +282,10 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
 	return block;
 }
 
-static int qnx4_statfs(struct super_block *sb, struct kstatfs *buf)
+static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	lock_kernel();
 
 	buf->f_type    = sb->s_magic;
@@ -561,10 +563,11 @@ static void destroy_inodecache(void)
 		       "qnx4_inode_cache: not all structures were freed\n");
 }
 
-static struct super_block *qnx4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int qnx4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
+			   mnt);
 }
 
 static struct file_system_type qnx4_fs_type = {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 14bd2246fb6d..b9677335cc8d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -185,16 +185,17 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	return 0;
 }
 
-struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+int ramfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
 }
 
-static struct super_block *rootfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int rootfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
+	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
+			    mnt);
 }
 
 static struct file_system_type ramfs_fs_type = {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cae2abbc0c71..00f1321e9209 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -60,7 +60,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
 }
 
 static int reiserfs_remount(struct super_block *s, int *flags, char *data);
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf);
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
@@ -1938,15 +1938,15 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return errval;
 }
 
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
 
-	buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
+	buf->f_namelen = (REISERFS_MAX_NAME(dentry->d_sb->s_blocksize));
 	buf->f_bfree = sb_free_blocks(rs);
 	buf->f_bavail = buf->f_bfree;
 	buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
-	buf->f_bsize = s->s_blocksize;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	/* changed to accommodate gcc folks. */
 	buf->f_type = REISERFS_SUPER_MAGIC;
 	return 0;
@@ -2249,11 +2249,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 
 #endif
 
-static struct super_block *get_super_block(struct file_system_type *fs_type,
-					   int flags, const char *dev_name,
-					   void *data)
+static int get_super_block(struct file_system_type *fs_type,
+			   int flags, const char *dev_name,
+			   void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
+			   mnt);
 }
 
 static int __init init_reiserfs_fs(void)
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ffb79c48c5bf..39fedaa88a0c 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -452,8 +452,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
 	/* We can deadlock if we try to free dentries,
 	   and an unlink/rmdir has just occured - GFP_NOFS avoids this */
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	page = read_cache_page(mapping, n,
-			       (filler_t *) mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 9b9eda7b335c..283fbc6b8eea 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -179,12 +179,12 @@ outnobh:
 /* That's simple too. */
 
 static int
-romfs_statfs(struct super_block *sb, struct kstatfs *buf)
+romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	buf->f_type = ROMFS_MAGIC;
 	buf->f_bsize = ROMBSIZE;
 	buf->f_bfree = buf->f_bavail = buf->f_ffree;
-	buf->f_blocks = (romfs_maxsize(sb)+ROMBSIZE-1)>>ROMBSBITS;
+	buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
 	buf->f_namelen = ROMFS_MAXFN;
 	return 0;
 }
@@ -607,10 +607,11 @@ static struct super_operations romfs_ops = {
 	.remount_fs	= romfs_remount,
 };
 
-static struct super_block *romfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int romfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type romfs_fs_type = {
diff --git a/fs/select.c b/fs/select.c
index a8109baa5e46..33b72ba0f86f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -546,37 +546,38 @@ struct poll_list {
 
 #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
 
-static void do_pollfd(unsigned int num, struct pollfd * fdpage,
-	poll_table ** pwait, int *count)
+/*
+ * Fish for pollable events on the pollfd->fd file descriptor. We're only
+ * interested in events matching the pollfd->events mask, and the result
+ * matching that mask is both recorded in pollfd->revents and returned. The
+ * pwait poll_table will be used by the fd-provided poll handler for waiting,
+ * if non-NULL.
+ */
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 {
-	int i;
-
-	for (i = 0; i < num; i++) {
-		int fd;
-		unsigned int mask;
-		struct pollfd *fdp;
-
-		mask = 0;
-		fdp = fdpage+i;
-		fd = fdp->fd;
-		if (fd >= 0) {
-			int fput_needed;
-			struct file * file = fget_light(fd, &fput_needed);
-			mask = POLLNVAL;
-			if (file != NULL) {
-				mask = DEFAULT_POLLMASK;
-				if (file->f_op && file->f_op->poll)
-					mask = file->f_op->poll(file, *pwait);
-				mask &= fdp->events | POLLERR | POLLHUP;
-				fput_light(file, fput_needed);
-			}
-			if (mask) {
-				*pwait = NULL;
-				(*count)++;
-			}
+	unsigned int mask;
+	int fd;
+
+	mask = 0;
+	fd = pollfd->fd;
+	if (fd >= 0) {
+		int fput_needed;
+		struct file * file;
+
+		file = fget_light(fd, &fput_needed);
+		mask = POLLNVAL;
+		if (file != NULL) {
+			mask = DEFAULT_POLLMASK;
+			if (file->f_op && file->f_op->poll)
+				mask = file->f_op->poll(file, pwait);
+			/* Mask out unneeded events. */
+			mask &= pollfd->events | POLLERR | POLLHUP;
+			fput_light(file, fput_needed);
 		}
-		fdp->revents = mask;
 	}
+	pollfd->revents = mask;
+
+	return mask;
 }
 
 static int do_poll(unsigned int nfds,  struct poll_list *list,
@@ -594,11 +595,29 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		walk = list;
-		while(walk != NULL) {
-			do_pollfd( walk->len, walk->entries, &pt, &count);
-			walk = walk->next;
+		for (walk = list; walk != NULL; walk = walk->next) {
+			struct pollfd * pfd, * pfd_end;
+
+			pfd = walk->entries;
+			pfd_end = pfd + walk->len;
+			for (; pfd != pfd_end; pfd++) {
+				/*
+				 * Fish for events. If we found one, record it
+				 * and kill the poll_table, so we don't
+				 * needlessly register any other waiters after
+				 * this. They'll get immediately deregistered
+				 * when we break out and return.
+				 */
+				if (do_pollfd(pfd, pt)) {
+					count++;
+					pt = NULL;
+				}
+			}
 		}
+		/*
+		 * All waiters have already been registered, so don't provide
+		 * a poll_table to them on the next loop iteration.
+		 */
 		pt = NULL;
 		if (count || !*timeout || signal_pending(current))
 			break;
@@ -727,9 +746,9 @@ out_fds:
 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 			long timeout_msecs)
 {
-	s64 timeout_jiffies = 0;
+	s64 timeout_jiffies;
 
-	if (timeout_msecs) {
+	if (timeout_msecs > 0) {
 #if HZ > 1000
 		/* We can only overflow if HZ > 1000 */
 		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
@@ -737,6 +756,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 		else
 #endif
 			timeout_jiffies = msecs_to_jiffies(timeout_msecs);
+	} else {
+		/* Infinite (< 0) or no (0) timeout */
+		timeout_jiffies = timeout_msecs;
 	}
 
 	return do_sys_poll(ufds, nfds, &timeout_jiffies);
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fdeabc0a34f7..506ff87c1d4b 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -48,7 +48,7 @@
 
 static void smb_delete_inode(struct inode *);
 static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct super_block *, struct kstatfs *);
+static int  smb_statfs(struct dentry *, struct kstatfs *);
 static int  smb_show_options(struct seq_file *, struct vfsmount *);
 
 static kmem_cache_t *smb_inode_cachep;
@@ -641,13 +641,13 @@ out_no_server:
 }
 
 static int
-smb_statfs(struct super_block *sb, struct kstatfs *buf)
+smb_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int result;
 	
 	lock_kernel();
 
-	result = smb_proc_dskattr(sb, buf);
+	result = smb_proc_dskattr(dentry, buf);
 
 	unlock_kernel();
 
@@ -782,10 +782,10 @@ out:
 	return error;
 }
 
-static struct super_block *smb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int smb_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, smb_fill_super);
+	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
 }
 
 static struct file_system_type smb_fs_type = {
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index b1b878b81730..c3495059889d 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -3226,9 +3226,9 @@ smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
 }
 
 int
-smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr)
+smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
 {
-	struct smb_sb_info *server = SMB_SB(sb);
+	struct smb_sb_info *server = SMB_SB(dentry->d_sb);
 	int result;
 	char *p;
 	long unit;
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 47664597e6b1..972ed7dad388 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -29,7 +29,7 @@ extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
 extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr);
+extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
 extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
 extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
 extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 481a97a423fa..3f71384020cb 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -20,6 +20,7 @@
 #include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/net.h>
+#include <linux/kthread.h>
 #include <net/ip.h>
 
 #include <linux/smb_fs.h>
@@ -40,7 +41,7 @@ enum smbiod_state {
 };
 
 static enum smbiod_state smbiod_state = SMBIOD_DEAD;
-static pid_t smbiod_pid;
+static struct task_struct *smbiod_thread;
 static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
 static LIST_HEAD(smb_servers);
 static DEFINE_SPINLOCK(servers_lock);
@@ -67,20 +68,29 @@ void smbiod_wake_up(void)
  */
 static int smbiod_start(void)
 {
-	pid_t pid;
+	struct task_struct *tsk;
+	int err = 0;
+
 	if (smbiod_state != SMBIOD_DEAD)
 		return 0;
 	smbiod_state = SMBIOD_STARTING;
 	__module_get(THIS_MODULE);
 	spin_unlock(&servers_lock);
-	pid = kernel_thread(smbiod, NULL, 0);
-	if (pid < 0)
+	tsk = kthread_run(smbiod, NULL, "smbiod");
+	if (IS_ERR(tsk)) {
+		err = PTR_ERR(tsk);
 		module_put(THIS_MODULE);
+	}
 
 	spin_lock(&servers_lock);
-	smbiod_state = pid < 0 ? SMBIOD_DEAD : SMBIOD_RUNNING;
-	smbiod_pid = pid;
-	return pid;
+	if (err < 0) {
+		smbiod_state = SMBIOD_DEAD;
+		smbiod_thread = NULL;
+	} else {
+		smbiod_state = SMBIOD_RUNNING;
+		smbiod_thread = tsk;
+	}
+	return err;
 }
 
 /*
@@ -290,8 +300,6 @@ out:
  */
 static int smbiod(void *unused)
 {
-	daemonize("smbiod");
-
 	allow_signal(SIGKILL);
 
 	VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
diff --git a/fs/splice.c b/fs/splice.c
index a285fd746dc0..05fd2787be98 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -55,31 +55,43 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
 				     struct pipe_buffer *buf)
 {
 	struct page *page = buf->page;
-	struct address_space *mapping = page_mapping(page);
+	struct address_space *mapping;
 
 	lock_page(page);
 
-	WARN_ON(!PageUptodate(page));
+	mapping = page_mapping(page);
+	if (mapping) {
+		WARN_ON(!PageUptodate(page));
 
-	/*
-	 * At least for ext2 with nobh option, we need to wait on writeback
-	 * completing on this page, since we'll remove it from the pagecache.
-	 * Otherwise truncate wont wait on the page, allowing the disk
-	 * blocks to be reused by someone else before we actually wrote our
-	 * data to them. fs corruption ensues.
-	 */
-	wait_on_page_writeback(page);
+		/*
+		 * At least for ext2 with nobh option, we need to wait on
+		 * writeback completing on this page, since we'll remove it
+		 * from the pagecache.  Otherwise truncate won't wait on the
+		 * page, allowing the disk blocks to be reused by someone else
+		 * before we actually wrote our data to them. fs corruption
+		 * ensues.
+		 */
+		wait_on_page_writeback(page);
 
-	if (PagePrivate(page))
-		try_to_release_page(page, mapping_gfp_mask(mapping));
+		if (PagePrivate(page))
+			try_to_release_page(page, mapping_gfp_mask(mapping));
 
-	if (!remove_mapping(mapping, page)) {
-		unlock_page(page);
-		return 1;
+		/*
+		 * If we succeeded in removing the mapping, set the LRU flag
+		 * and return success.
+		 */
+		if (remove_mapping(mapping, page)) {
+			buf->flags |= PIPE_BUF_FLAG_LRU;
+			return 0;
+		}
 	}
 
-	buf->flags |= PIPE_BUF_FLAG_LRU;
-	return 0;
+	/*
+	 * Raced with truncate or failed to remove page from current
+	 * address space, unlock and return failure.
+	 */
+	unlock_page(page);
+	return 1;
 }
 
 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
diff --git a/fs/super.c b/fs/super.c
index a66f66bb8049..8a669f6f3f52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb)
 	if (root) {
 		sb->s_root = NULL;
 		shrink_dcache_parent(root);
-		shrink_dcache_anon(&sb->s_anon);
+		shrink_dcache_sb(sb);
 		dput(root);
 		fsync_super(sb);
 		lock_super(sb);
@@ -486,7 +486,7 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
         s = user_get_super(new_decode_dev(dev));
         if (s == NULL)
                 goto out;
-	err = vfs_statfs(s, &sbuf);
+	err = vfs_statfs(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		goto out;
@@ -676,9 +676,10 @@ static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
 	}
 }
 
-struct super_block *get_sb_bdev(struct file_system_type *fs_type,
+int get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -686,7 +687,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 
 	bdev = open_bdev_excl(dev_name, flags, fs_type);
 	if (IS_ERR(bdev))
-		return (struct super_block *)bdev;
+		return PTR_ERR(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -697,15 +698,17 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
 	mutex_unlock(&bdev->bd_mount_mutex);
 	if (IS_ERR(s))
-		goto out;
+		goto error_s;
 
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & MS_RDONLY) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			s = ERR_PTR(-EBUSY);
+			error = -EBUSY;
+			goto error_bdev;
 		}
-		goto out;
+
+		close_bdev_excl(bdev);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -716,18 +719,21 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			s = ERR_PTR(error);
-		} else {
-			s->s_flags |= MS_ACTIVE;
-			bdev_uevent(bdev, KOBJ_MOUNT);
+			goto error;
 		}
+
+		s->s_flags |= MS_ACTIVE;
+		bdev_uevent(bdev, KOBJ_MOUNT);
 	}
 
-	return s;
+	return simple_set_mnt(mnt, s);
 
-out:
+error_s:
+	error = PTR_ERR(s);
+error_bdev:
 	close_bdev_excl(bdev);
-	return s;
+error:
+	return error;
 }
 
 EXPORT_SYMBOL(get_sb_bdev);
@@ -744,15 +750,16 @@ void kill_block_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_block_super);
 
-struct super_block *get_sb_nodev(struct file_system_type *fs_type,
+int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	int error;
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 
 	s->s_flags = flags;
 
@@ -760,10 +767,10 @@ struct super_block *get_sb_nodev(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 }
 
 EXPORT_SYMBOL(get_sb_nodev);
@@ -773,94 +780,100 @@ static int compare_single(struct super_block *s, void *p)
 	return 1;
 }
 
-struct super_block *get_sb_single(struct file_system_type *fs_type,
+int get_sb_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	struct super_block *s;
 	int error;
 
 	s = sget(fs_type, compare_single, set_anon_super, NULL);
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			return ERR_PTR(error);
+			return error;
 		}
 		s->s_flags |= MS_ACTIVE;
 	}
 	do_remount_sb(s, flags, data, 0);
-	return s;
+	return simple_set_mnt(mnt, s);
 }
 
 EXPORT_SYMBOL(get_sb_single);
 
 struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct file_system_type *type = get_fs_type(fstype);
-	struct super_block *sb = ERR_PTR(-ENOMEM);
 	struct vfsmount *mnt;
-	int error;
 	char *secdata = NULL;
+	int error;
 
 	if (!type)
 		return ERR_PTR(-ENODEV);
 
+	error = -ENOMEM;
 	mnt = alloc_vfsmnt(name);
 	if (!mnt)
 		goto out;
 
 	if (data) {
 		secdata = alloc_secdata();
-		if (!secdata) {
-			sb = ERR_PTR(-ENOMEM);
+		if (!secdata)
 			goto out_mnt;
-		}
 
 		error = security_sb_copy_data(type, data, secdata);
-		if (error) {
-			sb = ERR_PTR(error);
+		if (error)
 			goto out_free_secdata;
-		}
 	}
 
-	sb = type->get_sb(type, flags, name, data);
-	if (IS_ERR(sb))
+	error = type->get_sb(type, flags, name, data, mnt);
+	if (error < 0)
 		goto out_free_secdata;
- 	error = security_sb_kern_mount(sb, secdata);
+
+ 	error = security_sb_kern_mount(mnt->mnt_sb, secdata);
  	if (error)
  		goto out_sb;
-	mnt->mnt_sb = sb;
-	mnt->mnt_root = dget(sb->s_root);
-	mnt->mnt_mountpoint = sb->s_root;
+
+	mnt->mnt_mountpoint = mnt->mnt_root;
 	mnt->mnt_parent = mnt;
-	up_write(&sb->s_umount);
+	up_write(&mnt->mnt_sb->s_umount);
 	free_secdata(secdata);
-	put_filesystem(type);
 	return mnt;
 out_sb:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
-	sb = ERR_PTR(error);
+	dput(mnt->mnt_root);
+	up_write(&mnt->mnt_sb->s_umount);
+	deactivate_super(mnt->mnt_sb);
 out_free_secdata:
 	free_secdata(secdata);
 out_mnt:
 	free_vfsmnt(mnt);
 out:
-	put_filesystem(type);
-	return (struct vfsmount *)sb;
+	return ERR_PTR(error);
 }
 
-EXPORT_SYMBOL_GPL(do_kern_mount);
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
+	put_filesystem(type);
+	return mnt;
+}
 
 struct vfsmount *kern_mount(struct file_system_type *type)
 {
-	return do_kern_mount(type->name, 0, type->name, NULL);
+	return vfs_kern_mount(type, 0, type->name, NULL);
 }
 
 EXPORT_SYMBOL(kern_mount);
diff --git a/fs/sync.c b/fs/sync.c
index aab5ffe77e9f..955aef04da28 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -100,7 +100,7 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 	}
 
 	if (nbytes == 0)
-		endbyte = -1;
+		endbyte = LLONG_MAX;
 	else
 		endbyte--;		/* inclusive */
 
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f1117e885bd6..40190c489271 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -66,10 +66,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *sysfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, sysfs_fill_super);
+	return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
 }
 
 static struct file_system_type sysfs_fs_type = {
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index d7074341ee87..f2bef962d309 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -53,8 +53,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3ff89cc5833a..58b2d22142ba 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -85,8 +85,9 @@ static void sysv_put_super(struct super_block *sb)
 	kfree(sbi);
 }
 
-static int sysv_statfs(struct super_block *sb, struct kstatfs *buf)
+static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
 	buf->f_type = sb->s_magic;
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index e92b991e6dda..876639b93321 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -506,16 +506,17 @@ failed:
 
 /* Every kernel module contains stuff like this. */
 
-static struct super_block *sysv_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysv_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
+			   mnt);
 }
 
-static struct super_block *v7_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int v7_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
 }
 
 static struct file_system_type sysv_fs_type = {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e45789fe38e8..44fe2cb0bbb2 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -91,13 +91,13 @@ static void udf_load_partdesc(struct super_block *, struct buffer_head *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
-static int udf_statfs(struct super_block *, struct kstatfs *);
+static int udf_statfs(struct dentry *, struct kstatfs *);
 
 /* UDF filesystem type */
-static struct super_block *udf_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int udf_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
 }
 
 static struct file_system_type udf_fstype = {
@@ -1779,8 +1779,10 @@ udf_put_super(struct super_block *sb)
  *	Written, tested, and released.
  */
 static int
-udf_statfs(struct super_block *sb, struct kstatfs *buf)
+udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = UDF_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = UDF_SB_PARTLEN(sb, UDF_SB_PARTITION(sb));
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 3ada9dcf55b8..95b878e5c7a0 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -21,14 +21,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_BALLOC_DEBUG
-
-#ifdef UFS_BALLOC_DEBUG
-#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
 static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
 static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *);
@@ -39,7 +31,8 @@ static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *,
 /*
  * Free 'count' fragments from fragment number 'fragment'
  */
-void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count) {
+void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
+{
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
@@ -51,7 +44,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	
-	UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
+	UFSD("ENTER, fragment %u, count %u\n", fragment, count);
 	
 	if (ufs_fragnum(fragment) + count > uspi->s_fpg)
 		ufs_error (sb, "ufs_free_fragments", "internal error");
@@ -68,7 +61,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi) 
 		goto failed;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno);
 		goto failed;
@@ -76,11 +69,11 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
 
 	end_bit = bit + count;
 	bbase = ufs_blknum (bit);
-	blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase);
+	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
 	ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1);
 	for (i = bit; i < end_bit; i++) {
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i))
-			ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i);
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i))
+			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i);
 		else 
 			ufs_error (sb, "ufs_free_fragments",
 				   "bit already cleared for fragment %u", i);
@@ -90,51 +83,52 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
 
 	
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
-	fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree += count;
 	fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
-	blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase);
+	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
 	ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
 
 	/*
 	 * Trying to reassemble free fragments into block
 	 */
 	blkno = ufs_fragstoblks (bbase);
-	if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) {
+	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
 		fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
-		fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb);
+		uspi->cs_total.cs_nffree -= uspi->s_fpb;
 		fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+		uspi->cs_total.cs_nbfree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
 		cylno = ufs_cbtocylno (bbase);
 		fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1);
 		fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	}
 	
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 	
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
 failed:
 	unlock_super (sb);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return;
 }
 
 /*
  * Free 'count' fragments from fragment number 'fragment' (free whole blocks)
  */
-void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
+void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count)
+{
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
@@ -146,7 +140,7 @@ void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 
-	UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
+	UFSD("ENTER, fragment %u, count %u\n", fragment, count);
 	
 	if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) {
 		ufs_error (sb, "ufs_free_blocks", "internal error, "
@@ -162,7 +156,7 @@ do_more:
 	bit = ufs_dtogd (fragment);
 	if (cgno >= uspi->s_ncg) {
 		ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device");
-		goto failed;
+		goto failed_unlock;
 	}
 	end_bit = bit + count;
 	if (end_bit > uspi->s_fpg) {
@@ -173,36 +167,36 @@ do_more:
 
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi) 
-		goto failed;
-	ucg = ubh_get_ucg (UCPI_UBH);
+		goto failed_unlock;
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno);
-		goto failed;
+		goto failed_unlock;
 	}
 
 	for (i = bit; i < end_bit; i += uspi->s_fpb) {
 		blkno = ufs_fragstoblks(i);
-		if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) {
+		if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
 			ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
 		}
-		ubh_setblock(UCPI_UBH, ucpi->c_freeoff, blkno);
+		ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
 		DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+		uspi->cs_total.cs_nbfree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
 		cylno = ufs_cbtocylno(i);
 		fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1);
 		fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 
 	if (overflow) {
@@ -213,38 +207,127 @@ do_more:
 
 	sb->s_dirt = 1;
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
-failed:
+failed_unlock:
 	unlock_super (sb);
-	UFSD(("EXIT (FAILED)\n"))
+failed:
+	UFSD("EXIT (FAILED)\n");
 	return;
 }
 
+static struct page *ufs_get_locked_page(struct address_space *mapping,
+				  unsigned long index)
+{
+	struct page *page;
+
+try_again:
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		page = read_cache_page(mapping, index,
+				       (filler_t*)mapping->a_ops->readpage,
+				       NULL);
+		if (IS_ERR(page)) {
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "read_cache_page error: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+			goto out;
+		}
 
+		lock_page(page);
 
-#define NULLIFY_FRAGMENTS \
-	for (i = oldcount; i < newcount; i++) { \
-		bh = sb_getblk(sb, result + i); \
-		memset (bh->b_data, 0, sb->s_blocksize); \
-		set_buffer_uptodate(bh); \
-		mark_buffer_dirty (bh); \
-		if (IS_SYNC(inode)) \
-			sync_dirty_buffer(bh); \
-		brelse (bh); \
+		if (!PageUptodate(page) || PageError(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "can not read page: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+
+			page = ERR_PTR(-EIO);
+			goto out;
+		}
 	}
 
-unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
-	unsigned goal, unsigned count, int * err )
+	if (unlikely(!page->mapping || !page_has_buffers(page))) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto try_again;/*we really need these buffers*/
+	}
+out:
+	return page;
+}
+
+/*
+ * Modify the inode's page cache in the following way:
+ * have - blocks with b_blocknr equal to oldb...oldb+count-1
+ * get - blocks with b_blocknr equal to newb...newb+count-1
+ * We also assume that the oldb...oldb+count-1 blocks
+ * are situated at the end of the file.
+ *
+ * We can come here from ufs_writepage or ufs_prepare_write;
+ * locked_page is an argument of those functions, so it is already locked.
+ */
+static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk,
+			       unsigned int count, unsigned int oldb,
+			       unsigned int newb, struct page *locked_page)
+{
+	unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index, cur_index = locked_page->index;
+	unsigned int i, j;
+	struct page *page;
+	struct buffer_head *head, *bh;
+
+	UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
+	      inode->i_ino, count, oldb, newb);
+
+	BUG_ON(!PageLocked(locked_page));
+
+	for (i = 0; i < count; i += blk_per_page) {
+		index = (baseblk+i) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		if (likely(cur_index != index)) {
+			page = ufs_get_locked_page(mapping, index);
+			if (IS_ERR(page))
+				continue;
+		} else
+			page = locked_page;
+
+		j = i;
+		head = page_buffers(page);
+		bh = head;
+		do {
+			if (likely(bh->b_blocknr == j + oldb && j < count)) {
+				unmap_underlying_metadata(bh->b_bdev,
+							  bh->b_blocknr);
+				bh->b_blocknr = newb + j++;
+				mark_buffer_dirty(bh);
+			}
+
+			bh = bh->b_this_page;
+		} while (bh != head);
+
+		set_page_dirty(page);
+
+		if (likely(cur_index != index)) {
+			unlock_page(page);
+			page_cache_release(page);
+		}
+ 	}
+	UFSD("EXIT\n");
+}
+
+unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
+			   unsigned goal, unsigned count, int * err, struct page *locked_page)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
-	struct buffer_head * bh;
-	unsigned cgno, oldcount, newcount, tmp, request, i, result;
+	unsigned cgno, oldcount, newcount, tmp, request, result;
 	
-	UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count))
+	UFSD("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -273,14 +356,14 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 			return (unsigned)-1;
 		}
 		if (fragment < UFS_I(inode)->i_lastfrag) {
-			UFSD(("EXIT (ALREADY ALLOCATED)\n"))
+			UFSD("EXIT (ALREADY ALLOCATED)\n");
 			unlock_super (sb);
 			return 0;
 		}
 	}
 	else {
 		if (tmp) {
-			UFSD(("EXIT (ALREADY ALLOCATED)\n"))
+			UFSD("EXIT (ALREADY ALLOCATED)\n");
 			unlock_super(sb);
 			return 0;
 		}
@@ -289,9 +372,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 	/*
 	 * There is not enough space for user on the device
 	 */
-	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) {
+	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
 		unlock_super (sb);
-		UFSD(("EXIT (FAILED)\n"))
+		UFSD("EXIT (FAILED)\n");
 		return 0;
 	}
 
@@ -310,12 +393,10 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 		if (result) {
 			*p = cpu_to_fs32(sb, result);
 			*err = 0;
-			inode->i_blocks += count << uspi->s_nspfshift;
 			UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-			NULLIFY_FRAGMENTS
 		}
 		unlock_super(sb);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
@@ -325,11 +406,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 	result = ufs_add_fragments (inode, tmp, oldcount, newcount, err);
 	if (result) {
 		*err = 0;
-		inode->i_blocks += count << uspi->s_nspfshift;
 		UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-		NULLIFY_FRAGMENTS
 		unlock_super(sb);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
@@ -339,8 +418,8 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 	switch (fs32_to_cpu(sb, usb1->fs_optim)) {
 	    case UFS_OPTSPACE:
 		request = newcount;
-		if (uspi->s_minfree < 5 || fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) 
-		    > uspi->s_dsize * uspi->s_minfree / (2 * 100) )
+		if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
+		    > uspi->s_dsize * uspi->s_minfree / (2 * 100))
 			break;
 		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
 		break;
@@ -349,7 +428,7 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 	
 	    case UFS_OPTTIME:
 		request = uspi->s_fpb;
-		if (fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) < uspi->s_dsize *
+		if (uspi->cs_total.cs_nffree < uspi->s_dsize *
 		    (uspi->s_minfree - 2) / 100)
 			break;
 		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
@@ -357,39 +436,22 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 	}
 	result = ufs_alloc_fragments (inode, cgno, goal, request, err);
 	if (result) {
-		for (i = 0; i < oldcount; i++) {
-			bh = sb_bread(sb, tmp + i);
-			if(bh)
-			{
-				clear_buffer_dirty(bh);
-				bh->b_blocknr = result + i;
-				mark_buffer_dirty (bh);
-				if (IS_SYNC(inode))
-					sync_dirty_buffer(bh);
-				brelse (bh);
-			}
-			else
-			{
-				printk(KERN_ERR "ufs_new_fragments: bread fail\n");
-				unlock_super(sb);
-				return 0;
-			}
-		}
+		ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp,
+				   result, locked_page);
+
 		*p = cpu_to_fs32(sb, result);
 		*err = 0;
-		inode->i_blocks += count << uspi->s_nspfshift;
 		UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-		NULLIFY_FRAGMENTS
 		unlock_super(sb);
 		if (newcount < request)
 			ufs_free_fragments (inode, result + newcount, request - newcount);
 		ufs_free_fragments (inode, tmp, oldcount);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
 	unlock_super(sb);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 }		
 
@@ -404,7 +466,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, fragno, fragoff, count, fragsize, i;
 	
-	UFSD(("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount))
+	UFSD("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -419,7 +481,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi)
 		return 0;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_add_fragments",
 			"internal error, bad magic number on cg %u", cgno);
@@ -429,14 +491,14 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	fragno = ufs_dtogd (fragment);
 	fragoff = ufs_fragnum (fragno);
 	for (i = oldcount; i < newcount; i++)
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i))
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
 			return 0;
 	/*
 	 * Block can be extended
 	 */
 	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
 	for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i))
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
 			break;
 	fragsize = i - oldcount;
 	if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize]))
@@ -446,7 +508,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	if (fragsize != count)
 		fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
 	for (i = oldcount; i < newcount; i++)
-		ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, fragno + i);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
 	if(DQUOT_ALLOC_BLOCK(inode, count)) {
 		*err = -EDQUOT;
 		return 0;
@@ -454,17 +516,17 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree -= count;
 	
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
-	UFSD(("EXIT, fragment %u\n", fragment))
+	UFSD("EXIT, fragment %u\n", fragment);
 	
 	return fragment;
 }
@@ -487,7 +549,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
 	struct ufs_cylinder_group * ucg;
 	unsigned oldcg, i, j, k, result, allocsize;
 	
-	UFSD(("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count))
+	UFSD("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -521,14 +583,14 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
 		UFS_TEST_FREE_SPACE_CG
 	}
 	
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 
 cg_found:
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi)
 		return 0;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) 
 		ufs_panic (sb, "ufs_alloc_fragments",
 			"internal error, bad magic number on cg %u", cgno);
@@ -551,12 +613,12 @@ cg_found:
 			return 0;
 		goal = ufs_dtogd (result);
 		for (i = count; i < uspi->s_fpb; i++)
-			ubh_setbit (UCPI_UBH, ucpi->c_freeoff, goal + i);
+			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
 		i = uspi->s_fpb - count;
 		DQUOT_FREE_BLOCK(inode, i);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nffree, i);
+		uspi->cs_total.cs_nffree += i;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i);
 		fs32_add(sb, &ucg->cg_frsum[i], 1);
 		goto succed;
@@ -570,10 +632,10 @@ cg_found:
 		return 0;
 	}
 	for (i = 0; i < count; i++)
-		ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, result + i);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
 	
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree -= count;
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
 	fs32_sub(sb, &ucg->cg_frsum[allocsize], 1);
 
@@ -581,16 +643,16 @@ cg_found:
 		fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1);
 
 succed:
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
 	result += cgno * uspi->s_fpg;
-	UFSD(("EXIT3, result %u\n", result))
+	UFSD("EXIT3, result %u\n", result);
 	return result;
 }
 
@@ -603,12 +665,12 @@ static unsigned ufs_alloccg_block (struct inode * inode,
 	struct ufs_cylinder_group * ucg;
 	unsigned result, cylno, blkno;
 
-	UFSD(("ENTER, goal %u\n", goal))
+	UFSD("ENTER, goal %u\n", goal);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal == 0) {
 		goal = ucpi->c_rotor;
@@ -620,7 +682,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
 	/*
 	 * If the requested block is available, use it.
 	 */
-	if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, ufs_fragstoblks(goal))) {
+	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
 		result = goal;
 		goto gotit;
 	}
@@ -632,7 +694,7 @@ norot:
 	ucpi->c_rotor = result;
 gotit:
 	blkno = ufs_fragstoblks(result);
-	ubh_clrblock (UCPI_UBH, ucpi->c_freeoff, blkno);
+	ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 		ufs_clusteracct (sb, ucpi, blkno, -1);
 	if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) {
@@ -641,31 +703,76 @@ gotit:
 	}
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+	uspi->cs_total.cs_nbfree--;
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
 	cylno = ufs_cbtocylno(result);
 	fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1);
 	fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	
-	UFSD(("EXIT, result %u\n", result))
+	UFSD("EXIT, result %u\n", result);
 
 	return result;
 }
 
-static unsigned ufs_bitmap_search (struct super_block * sb,
-	struct ufs_cg_private_info * ucpi, unsigned goal, unsigned count)
+static unsigned ubh_scanc(struct ufs_sb_private_info *uspi,
+			  struct ufs_buffer_head *ubh,
+			  unsigned begin, unsigned size,
+			  unsigned char *table, unsigned char mask)
 {
-	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
-	struct ufs_cylinder_group * ucg;
-	unsigned start, length, location, result;
-	unsigned possition, fragsize, blockmap, mask;
-	
-	UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count))
+	unsigned rest, offset;
+	unsigned char *cp;
+	
+
+	offset = begin & ~uspi->s_fmask;
+	begin >>= uspi->s_fshift;
+	for (;;) {
+		if ((offset + size) < uspi->s_fsize)
+			rest = size;
+		else
+			rest = uspi->s_fsize - offset;
+		size -= rest;
+		cp = ubh->bh[begin]->b_data + offset;
+		while ((table[*cp++] & mask) == 0 && --rest)
+			;
+		if (rest || !size)
+			break;
+		begin++;
+		offset = 0;
+	}
+	return (size + rest);
+}
+
+/*
+ * Find a block of the specified size in the specified cylinder group.
+ * @sb: pointer to super block
+ * @ucpi: pointer to cylinder group info
+ * @goal: block near which we want to find a new one
+ * @count: specified size
+ */
+static unsigned ufs_bitmap_search(struct super_block *sb,
+				  struct ufs_cg_private_info *ucpi,
+				  unsigned goal, unsigned count)
+{
+	/*
+	 * Bit patterns for identifying fragments in the block map
+	 * used as ((map & mask_arr) == want_arr)
+	 */
+	static const int mask_arr[9] = {
+		0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
+	};
+	static const int want_arr[9] = {
+		0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
+	};
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_cylinder_group *ucg;
+	unsigned start, length, loc, result;
+	unsigned pos, want, blockmap, mask, end;
+
+	UFSD("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count);
 
-	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first (uspi);
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal)
 		start = ufs_dtogd(goal) >> 3;
@@ -673,53 +780,50 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
 		start = ucpi->c_frotor >> 3;
 		
 	length = ((uspi->s_fpg + 7) >> 3) - start;
-	location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff + start, length,
+	loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
 		(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
 		1 << (count - 1 + (uspi->s_fpb & 7))); 
-	if (location == 0) {
+	if (loc == 0) {
 		length = start + 1;
-		location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff, length, 
-			(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
-			1 << (count - 1 + (uspi->s_fpb & 7)));
-		if (location == 0) {
-			ufs_error (sb, "ufs_bitmap_search",
-			"bitmap corrupted on cg %u, start %u, length %u, count %u, freeoff %u\n",
-			ucpi->c_cgx, start, length, count, ucpi->c_freeoff);
+		loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length,
+				(uspi->s_fpb == 8) ? ufs_fragtable_8fpb :
+				ufs_fragtable_other,
+				1 << (count - 1 + (uspi->s_fpb & 7)));
+		if (loc == 0) {
+			ufs_error(sb, "ufs_bitmap_search",
+				  "bitmap corrupted on cg %u, start %u,"
+				  " length %u, count %u, freeoff %u\n",
+				  ucpi->c_cgx, start, length, count,
+				  ucpi->c_freeoff);
 			return (unsigned)-1;
 		}
 		start = 0;
 	}
-	result = (start + length - location) << 3;
+	result = (start + length - loc) << 3;
 	ucpi->c_frotor = result;
 
 	/*
 	 * found the byte in the map
 	 */
-	blockmap = ubh_blkmap(UCPI_UBH, ucpi->c_freeoff, result);
-	fragsize = 0;
-	for (possition = 0, mask = 1; possition < 8; possition++, mask <<= 1) {
-		if (blockmap & mask) {
-			if (!(possition & uspi->s_fpbmask))
-				fragsize = 1;
-			else 
-				fragsize++;
-		}
-		else {
-			if (fragsize == count) {
-				result += possition - count;
-				UFSD(("EXIT, result %u\n", result))
-				return result;
-			}
-			fragsize = 0;
-		}
-	}
-	if (fragsize == count) {
-		result += possition - count;
-		UFSD(("EXIT, result %u\n", result))
-		return result;
-	}
-	ufs_error (sb, "ufs_bitmap_search", "block not in map on cg %u\n", ucpi->c_cgx);
-	UFSD(("EXIT (FAILED)\n"))
+
+	for (end = result + 8; result < end; result += uspi->s_fpb) {
+		blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
+		blockmap <<= 1;
+		mask = mask_arr[count];
+		want = want_arr[count];
+		for (pos = 0; pos <= uspi->s_fpb - count; pos++) {
+			if ((blockmap & mask) == want) {
+				UFSD("EXIT, result %u\n", result);
+				return result + pos;
+ 			}
+			mask <<= 1;
+			want <<= 1;
+ 		}
+ 	}
+
+	ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n",
+		  ucpi->c_cgx);
+	UFSD("EXIT (FAILED)\n");
 	return (unsigned)-1;
 }
 
@@ -734,9 +838,9 @@ static void ufs_clusteracct(struct super_block * sb,
 		return;
 
 	if (cnt > 0)
-		ubh_setbit(UCPI_UBH, ucpi->c_clusteroff, blkno);
+		ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
 	else
-		ubh_clrbit(UCPI_UBH, ucpi->c_clusteroff, blkno);
+		ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
 
 	/*
 	 * Find the size of the cluster going forward.
@@ -745,7 +849,7 @@ static void ufs_clusteracct(struct super_block * sb,
 	end = start + uspi->s_contigsumsize;
 	if ( end >= ucpi->c_nclusterblks)
 		end = ucpi->c_nclusterblks;
-	i = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_clusteroff, end, start);
+	i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start);
 	if (i > end)
 		i = end;
 	forw = i - start;
@@ -757,7 +861,7 @@ static void ufs_clusteracct(struct super_block * sb,
 	end = start - uspi->s_contigsumsize;
 	if (end < 0 ) 
 		end = -1;
-	i = ubh_find_last_zero_bit (UCPI_UBH, ucpi->c_clusteroff, start, end);
+	i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end);
 	if ( i < end) 
 		i = end;
 	back = start - i;
@@ -769,11 +873,11 @@ static void ufs_clusteracct(struct super_block * sb,
 	i = back + forw + 1;
 	if (i > uspi->s_contigsumsize)
 		i = uspi->s_contigsumsize;
-	fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (i << 2)), cnt);
+	fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt);
 	if (back > 0)
-		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (back << 2)), cnt);
+		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt);
 	if (forw > 0)
-		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (forw << 2)), cnt);
+		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt);
 }
 
 
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 14abb8b835f7..09c39e5e6386 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -20,15 +20,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_CYLINDER_DEBUG
-
-#ifdef UFS_CYLINDER_DEBUG
-#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-
 /*
  * Read cylinder group into cache. The memory space for ufs_cg_private_info
  * structure is already allocated during ufs_read_super.
@@ -42,19 +33,19 @@ static void ufs_read_cylinder (struct super_block * sb,
 	struct ufs_cylinder_group * ucg;
 	unsigned i, j;
 
-	UFSD(("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr))
+	UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr);
 	uspi = sbi->s_uspi;
 	ucpi = sbi->s_ucpi[bitmap_nr];
 	ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
 
-	UCPI_UBH->fragment = ufs_cgcmin(cgno);
-	UCPI_UBH->count = uspi->s_cgsize >> sb->s_blocksize_bits;
+	UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno);
+	UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits;
 	/*
 	 * We have already the first fragment of cylinder group block in buffer
 	 */
-	UCPI_UBH->bh[0] = sbi->s_ucg[cgno];
-	for (i = 1; i < UCPI_UBH->count; i++)
-		if (!(UCPI_UBH->bh[i] = sb_bread(sb, UCPI_UBH->fragment + i)))
+	UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
+	for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
+		if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
 			goto failed;
 	sbi->s_cgno[bitmap_nr] = cgno;
 			
@@ -73,7 +64,7 @@ static void ufs_read_cylinder (struct super_block * sb,
 	ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff);
 	ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
 	ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;	
 	
 failed:
@@ -95,15 +86,15 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 	struct ufs_cylinder_group * ucg;
 	unsigned i;
 
-	UFSD(("ENTER, bitmap_nr %u\n", bitmap_nr))
+	UFSD("ENTER, bitmap_nr %u\n", bitmap_nr);
 
 	uspi = sbi->s_uspi;
 	if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) {
-		UFSD(("EXIT\n"))
+		UFSD("EXIT\n");
 		return;
 	}
 	ucpi = sbi->s_ucpi[bitmap_nr];
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) {
 		ufs_panic (sb, "ufs_put_cylinder", "internal error");
@@ -116,13 +107,13 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 	ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor);
 	ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor);
 	ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor);
-	ubh_mark_buffer_dirty (UCPI_UBH);
-	for (i = 1; i < UCPI_UBH->count; i++) {
-		brelse (UCPI_UBH->bh[i]);
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
+		brelse (UCPI_UBH(ucpi)->bh[i]);
 	}
 
 	sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 /*
@@ -139,7 +130,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 	struct ufs_cg_private_info * ucpi;
 	unsigned cg, i, j;
 
-	UFSD(("ENTER, cgno %u\n", cgno))
+	UFSD("ENTER, cgno %u\n", cgno);
 
 	uspi = sbi->s_uspi;
 	if (cgno >= uspi->s_ncg) {
@@ -150,7 +141,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 	 * Cylinder group number cg it in cache and it was last used
 	 */
 	if (sbi->s_cgno[0] == cgno) {
-		UFSD(("EXIT\n"))
+		UFSD("EXIT\n");
 		return sbi->s_ucpi[0];
 	}
 	/*
@@ -160,16 +151,16 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 		if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) {
 			if (sbi->s_cgno[cgno] != cgno) {
 				ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache");
-				UFSD(("EXIT (FAILED)\n"))
+				UFSD("EXIT (FAILED)\n");
 				return NULL;
 			}
 			else {
-				UFSD(("EXIT\n"))
+				UFSD("EXIT\n");
 				return sbi->s_ucpi[cgno];
 			}
 		} else {
 			ufs_read_cylinder (sb, cgno, cgno);
-			UFSD(("EXIT\n"))
+			UFSD("EXIT\n");
 			return sbi->s_ucpi[cgno];
 		}
 	}
@@ -204,6 +195,6 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 		sbi->s_ucpi[0] = ucpi;
 		ufs_read_cylinder (sb, cgno, 0);
 	}
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return sbi->s_ucpi[0];
 }
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 1a561202d3f4..7f0a0aa63584 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -11,31 +11,20 @@
  * 4.4BSD (FreeBSD) support added on February 1st 1998 by
  * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
  * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
+ *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
  */
 
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
 #include <linux/sched.h>
 
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_DIR_DEBUG
-
-#ifdef UFS_DIR_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-static int
-ufs_check_dir_entry (const char *, struct inode *, struct ufs_dir_entry *,
-		     struct buffer_head *, unsigned long);
-
-
 /*
  * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
  *
@@ -51,495 +40,541 @@ static inline int ufs_match(struct super_block *sb, int len,
 	return !memcmp(name, de->d_name, len);
 }
 
-/*
- * This is blatantly stolen from ext2fs
- */
-static int
-ufs_readdir (struct file * filp, void * dirent, filldir_t filldir)
+static int ufs_commit_chunk(struct page *page, unsigned from, unsigned to)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
-	int error = 0;
-	unsigned long offset, lblk;
-	int i, stored;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de;
-	struct super_block * sb;
-	int de_reclen;
-	unsigned flags;
-	u64     blk= 0L;
-
-	lock_kernel();
-
-	sb = inode->i_sb;
-	flags = UFS_SB(sb)->s_flags;
-
-	UFSD(("ENTER, ino %lu  f_pos %lu\n", inode->i_ino, (unsigned long) filp->f_pos))
-
-	stored = 0;
-	bh = NULL;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
-
-	while (!error && !stored && filp->f_pos < inode->i_size) {
-		lblk = (filp->f_pos) >> sb->s_blocksize_bits;
-		blk = ufs_frag_map(inode, lblk);
-		if (!blk || !(bh = sb_bread(sb, blk))) {
-			/* XXX - error - skip to the next block */
-			printk("ufs_readdir: "
-			       "dir inode %lu has a hole at offset %lu\n",
-			       inode->i_ino, (unsigned long int)filp->f_pos);
-			filp->f_pos += sb->s_blocksize - offset;
-			continue;
-		}
-
-revalidate:
-		/* If the dir block has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now.  Scan from the start of the block
-		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
-			for (i = 0; i < sb->s_blocksize && i < offset; ) {
-				de = (struct ufs_dir_entry *)(bh->b_data + i);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero.  A
-				 * failure will be detected in the
-				 * dirent test below. */
-				de_reclen = fs16_to_cpu(sb, de->d_reclen);
-				if (de_reclen < 1)
-					break;
-				i += de_reclen;
-			}
-			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
-				| offset;
-			filp->f_version = inode->i_version;
-		}
+	struct inode *dir = page->mapping->host;
+	int err = 0;
+	dir->i_version++;
+	page->mapping->a_ops->commit_write(NULL, page, from, to);
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+	return err;
+}
 
-		while (!error && filp->f_pos < inode->i_size
-		       && offset < sb->s_blocksize) {
-			de = (struct ufs_dir_entry *) (bh->b_data + offset);
-			/* XXX - put in a real ufs_check_dir_entry() */
-			if ((de->d_reclen == 0) || (ufs_get_de_namlen(sb, de) == 0)) {
-				filp->f_pos = (filp->f_pos &
-				              (sb->s_blocksize - 1)) +
-				               sb->s_blocksize;
-				brelse(bh);
-				unlock_kernel();
-				return stored;
-			}
-			if (!ufs_check_dir_entry ("ufs_readdir", inode, de,
-						   bh, offset)) {
-				/* On error, skip the f_pos to the
-				   next block. */
-				filp->f_pos = (filp->f_pos |
-				              (sb->s_blocksize - 1)) +
-					       1;
-				brelse (bh);
-				unlock_kernel();
-				return stored;
-			}
-			offset += fs16_to_cpu(sb, de->d_reclen);
-			if (de->d_ino) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation. */
-				unsigned long version = filp->f_version;
-				unsigned char d_type = DT_UNKNOWN;
+static inline void ufs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
 
-				UFSD(("filldir(%s,%u)\n", de->d_name,
-							fs32_to_cpu(sb, de->d_ino)))
-				UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de)))
+static inline unsigned long ufs_dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
 
-				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
-					d_type = de->d_u.d_44.d_type;
-				error = filldir(dirent, de->d_name,
-						ufs_get_de_namlen(sb, de), filp->f_pos,
-						fs32_to_cpu(sb, de->d_ino), d_type);
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored ++;
-			}
-			filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
-		}
-		offset = 0;
-		brelse (bh);
+ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct ufs_dir_entry *de;
+	struct page *page;
+	
+	de = ufs_find_entry(dir, dentry, &page);
+	if (de) {
+		res = fs32_to_cpu(dir->i_sb, de->d_ino);
+		ufs_put_page(page);
 	}
-	unlock_kernel();
-	return 0;
+	return res;
 }
 
-/*
- * define how far ahead to read directories while searching them.
- */
-#define NAMEI_RA_CHUNKS  2
-#define NAMEI_RA_BLOCKS  4
-#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 
-/*
- *	ufs_find_entry()
- *
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_bh). It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- */
-struct ufs_dir_entry * ufs_find_entry (struct dentry *dentry,
-	struct buffer_head ** res_bh)
+/* Releases the page */
+void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+		  struct page *page, struct inode *inode)
 {
-	struct super_block * sb;
-	struct buffer_head * bh_use[NAMEI_RA_SIZE];
-	struct buffer_head * bh_read[NAMEI_RA_SIZE];
-	unsigned long offset;
-	int block, toread, i, err;
-	struct inode *dir = dentry->d_parent->d_inode;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	unsigned from = (char *) de - (char *) page_address(page);
+	unsigned to = from + fs16_to_cpu(dir->i_sb, de->d_reclen);
+	int err;
 
-	UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen))
-	
-	*res_bh = NULL;
-	
-	sb = dir->i_sb;
-	
-	if (namelen > UFS_MAXNAMLEN)
-		return NULL;
+	lock_page(page);
+	err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
+	BUG_ON(err);
+	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
+	ufs_set_de_type(dir->i_sb, de, inode->i_mode);
+	err = ufs_commit_chunk(page, from, to);
+	ufs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+}
 
-	memset (bh_use, 0, sizeof (bh_use));
-	toread = 0;
-	for (block = 0; block < NAMEI_RA_SIZE; ++block) {
-		struct buffer_head * bh;
 
-		if ((block << sb->s_blocksize_bits) >= dir->i_size)
-			break;
-		bh = ufs_getfrag (dir, block, 0, &err);
-		bh_use[block] = bh;
-		if (bh && !buffer_uptodate(bh))
-			bh_read[toread++] = bh;
+/*
+ * Validate a directory page: sets PageChecked, and PageError if corrupt.
+ */
+static void ufs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	struct super_block *sb = dir->i_sb;
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct ufs_dir_entry *p;
+	char *error;
+
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (UFS_SECTOR_SIZE - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
 	}
+	for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct ufs_dir_entry *)(kaddr + offs);
+		rec_len = fs16_to_cpu(sb, p->d_reclen);
+
+		if (rec_len < UFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p)))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1))
+			goto Espan;
+		if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
+						  UFS_SB(sb)->s_uspi->s_ncg))
+			goto Einumber;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+	/* Error paths: report the problem, then mark the page as bad. */
+Ebadsize:
+	ufs_error(sb, "ufs_check_page",
+		  "size of directory #%lu is not a multiple of chunk size",
+		  dir->i_ino);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+Einumber:
+	error = "inode out of bounds";
+bad_entry:
+	ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
+		   "offset=%lu, rec_len=%d, name_len=%d",
+		   dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		   rec_len, ufs_get_de_namlen(sb, p));
+	goto fail;
+Eend:
+	ufs_error (sb, "ufs_check_page",
+		   "entry in directory #%lu spans the page boundary - "
+		   "offset=%lu",
+		   dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
 
-	for (block = 0, offset = 0; offset < dir->i_size; block++) {
-		struct buffer_head * bh;
-		struct ufs_dir_entry * de;
-		char * dlimit;
-
-		if ((block % NAMEI_RA_BLOCKS) == 0 && toread) {
-			ll_rw_block (READ, toread, bh_read);
-			toread = 0;
-		}
-		bh = bh_use[block % NAMEI_RA_SIZE];
-		if (!bh) {
-			ufs_error (sb, "ufs_find_entry", 
-				"directory #%lu contains a hole at offset %lu",
-				dir->i_ino, offset);
-			offset += sb->s_blocksize;
-			continue;
-		}
-		wait_on_buffer (bh);
-		if (!buffer_uptodate(bh)) {
-			/*
-			 * read error: all bets are off
-			 */
-			break;
-		}
-
-		de = (struct ufs_dir_entry *) bh->b_data;
-		dlimit = bh->b_data + sb->s_blocksize;
-		while ((char *) de < dlimit && offset < dir->i_size) {
-			/* this code is executed quadratically often */
-			/* do minimal checking by hand */
-			int de_len;
-
-			if ((char *) de + namelen <= dlimit &&
-			    ufs_match(sb, namelen, name, de)) {
-				/* found a match -
-				just to be sure, do a full check */
-				if (!ufs_check_dir_entry("ufs_find_entry",
-				    dir, de, bh, offset))
-					goto failed;
-				for (i = 0; i < NAMEI_RA_SIZE; ++i) {
-					if (bh_use[i] != bh)
-						brelse (bh_use[i]);
-				}
-				*res_bh = bh;
-				return de;
-			}
-                        /* prevent looping on a bad block */
-			de_len = fs16_to_cpu(sb, de->d_reclen);
-			if (de_len <= 0)
-				goto failed;
-			offset += de_len;
-			de = (struct ufs_dir_entry *) ((char *) de + de_len);
-		}
-
-		brelse (bh);
-		if (((block + NAMEI_RA_SIZE) << sb->s_blocksize_bits ) >=
-		    dir->i_size)
-			bh = NULL;
-		else
-			bh = ufs_getfrag (dir, block + NAMEI_RA_SIZE, 0, &err);
-		bh_use[block % NAMEI_RA_SIZE] = bh;
-		if (bh && !buffer_uptodate(bh))
-			bh_read[toread++] = bh;
+static struct page *ufs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_cache_page(mapping, n,
+				(filler_t*)mapping->a_ops->readpage, NULL);
+	if (!IS_ERR(page)) {
+		wait_on_page_locked(page);
+		kmap(page);
+		if (!PageUptodate(page))
+			goto fail;
+		if (!PageChecked(page))
+			ufs_check_page(page);
+		if (PageError(page))
+			goto fail;
 	}
+	return page;
 
-failed:
-	for (i = 0; i < NAMEI_RA_SIZE; ++i) brelse (bh_use[i]);
-	UFSD(("EXIT\n"))
-	return NULL;
+fail:
+	ufs_put_page(page);
+	return ERR_PTR(-EIO);
 }
 
-static int
-ufs_check_dir_entry (const char *function, struct inode *dir,
-		     struct ufs_dir_entry *de, struct buffer_head *bh,
-		     unsigned long offset)
+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned
+ufs_last_byte(struct inode *inode, unsigned long page_nr)
 {
-	struct super_block *sb = dir->i_sb;
-	const char *error_msg = NULL;
-	int rlen = fs16_to_cpu(sb, de->d_reclen);
-
-	if (rlen < UFS_DIR_REC_LEN(1))
-		error_msg = "reclen is smaller than minimal";
-	else if (rlen % 4 != 0)
-		error_msg = "reclen % 4 != 0";
-	else if (rlen < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)))
-		error_msg = "reclen is too small for namlen";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
-		error_msg = "directory entry across blocks";
-	else if (fs32_to_cpu(sb, de->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
-				      UFS_SB(sb)->s_uspi->s_ncg))
-		error_msg = "inode out of bounds";
-
-	if (error_msg != NULL)
-		ufs_error (sb, function, "bad entry in directory #%lu, size %Lu: %s - "
-			    "offset=%lu, inode=%lu, reclen=%d, namlen=%d",
-			    dir->i_ino, dir->i_size, error_msg, offset,
-			    (unsigned long)fs32_to_cpu(sb, de->d_ino),
-			    rlen, ufs_get_de_namlen(sb, de));
-	
-	return (error_msg == NULL ? 1 : 0);
+	unsigned last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
 }
 
-struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct buffer_head **p)
+static inline struct ufs_dir_entry *
+ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p)
 {
-	int err;
-	struct buffer_head *bh = ufs_bread (dir, 0, 0, &err);
-	struct ufs_dir_entry *res = NULL;
-
-	if (bh) {
-		res = (struct ufs_dir_entry *) bh->b_data;
-		res = (struct ufs_dir_entry *)((char *)res +
-			fs16_to_cpu(dir->i_sb, res->d_reclen));
-	}
-	*p = bh;
-	return res;
+	return (struct ufs_dir_entry *)((char *)p +
+					fs16_to_cpu(sb, p->d_reclen));
 }
-ino_t ufs_inode_by_name(struct inode * dir, struct dentry *dentry)
+
+struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
 {
-	ino_t res = 0;
-	struct ufs_dir_entry * de;
-	struct buffer_head *bh;
+	struct page *page = ufs_get_page(dir, 0);
+	struct ufs_dir_entry *de = NULL;
 
-	de = ufs_find_entry (dentry, &bh);
-	if (de) {
-		res = fs32_to_cpu(dir->i_sb, de->d_ino);
-		brelse(bh);
+	if (!IS_ERR(page)) {
+		de = ufs_next_entry(dir->i_sb,
+				    (struct ufs_dir_entry *)page_address(page));
+		*p = page;
 	}
-	return res;
+	return de;
 }
 
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
-		struct buffer_head *bh, struct inode *inode)
+/*
+ *	ufs_find_entry()
+ *
+ * Finds an entry in the specified directory with the wanted name. It
+ * returns the entry itself, and the page in which the entry was found
+ * (as a parameter - res_page). Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
+				     struct page **res_page)
 {
-	dir->i_version++;
-	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
-	mark_buffer_dirty(bh);
-	if (IS_DIRSYNC(dir))
-		sync_dirty_buffer(bh);
-	brelse (bh);
+	struct super_block *sb = dir->i_sb;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned reclen = UFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = ufs_dir_pages(dir);
+	struct page *page = NULL;
+	struct ufs_inode_info *ui = UFS_I(dir);
+	struct ufs_dir_entry *de;
+
+	UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
+
+	if (npages == 0 || namelen > UFS_MAXNAMLEN)
+		goto out;
+
+	/* OFFSET_CACHE */
+	*res_page = NULL;
+
+	start = ui->i_dir_start_lookup;
+
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = ufs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct ufs_dir_entry *) kaddr;
+			kaddr += ufs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->d_reclen == 0) {
+					ufs_error(dir->i_sb, __FUNCTION__,
+						  "zero-length directory entry");
+					ufs_put_page(page);
+					goto out;
+				}
+				if (ufs_match(sb, namelen, name, de))
+					goto found;
+				de = ufs_next_entry(sb, de);
+			}
+			ufs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	ui->i_dir_start_lookup = n;
+	return de;
 }
 
 /*
- *	ufs_add_entry()
- *
- * adds a file entry to the specified directory, using the same
- * semantics as ufs_find_entry(). It returns NULL if it failed.
+ *	Parent is locked.
  */
 int ufs_add_link(struct dentry *dentry, struct inode *inode)
 {
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
-	unsigned long offset;
-	unsigned fragoff;
-	unsigned short rec_len;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de, * de1;
 	struct inode *dir = dentry->d_parent->d_inode;
 	const char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
+	struct super_block *sb = dir->i_sb;
+	unsigned reclen = UFS_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct ufs_dir_entry *de;
+	unsigned long npages = ufs_dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	unsigned from, to;
 	int err;
 
-	UFSD(("ENTER, name %s, namelen %u\n", name, namelen))
-	
-	sb = dir->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
-
-	if (!namelen)
-		return -EINVAL;
-	bh = ufs_bread (dir, 0, 0, &err);
-	if (!bh)
-		return err;
-	rec_len = UFS_DIR_REC_LEN(namelen);
-	offset = 0;
-	de = (struct ufs_dir_entry *) bh->b_data;
-	while (1) {
-		if ((char *)de >= UFS_SECTOR_SIZE + bh->b_data) {
-			fragoff = offset & ~uspi->s_fmask;
-			if (fragoff != 0 && fragoff != UFS_SECTOR_SIZE)
-				ufs_error (sb, "ufs_add_entry", "internal error"
-					" fragoff %u", fragoff);
-			if (!fragoff) {
-				brelse (bh);
-				bh = ufs_bread (dir, offset >> sb->s_blocksize_bits, 1, &err);
-				if (!bh)
-					return err;
-			}
-			if (dir->i_size <= offset) {
-				if (dir->i_size == 0) {
-					brelse(bh);
-					return -ENOENT;
-				}
-				de = (struct ufs_dir_entry *) (bh->b_data + fragoff);
-				de->d_ino = 0;
+	UFSD("ENTER, name %s, namelen %u\n", name, namelen);
+
+	/*
+	 * We take care of directory expansion in the same loop.
+	 * This code plays outside i_size, so it locks the page
+	 * to protect that region.
+	 */
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = ufs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + ufs_last_byte(dir, n);
+		de = (struct ufs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				/* We hit i_size */
+				name_len = 0;
+				rec_len = UFS_SECTOR_SIZE;
 				de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE);
-				ufs_set_de_namlen(sb, de, 0);
-				dir->i_size = offset + UFS_SECTOR_SIZE;
-				mark_inode_dirty(dir);
-			} else {
-				de = (struct ufs_dir_entry *) bh->b_data;
+				de->d_ino = 0;
+				goto got_it;
 			}
+			if (de->d_reclen == 0) {
+				ufs_error(dir->i_sb, __FUNCTION__,
+					  "zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (ufs_match(sb, namelen, name, de))
+				goto out_unlock;
+			name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de));
+			rec_len = fs16_to_cpu(sb, de->d_reclen);
+			if (!de->d_ino && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct ufs_dir_entry *) ((char *) de + rec_len);
 		}
-		if (!ufs_check_dir_entry ("ufs_add_entry", dir, de, bh, offset)) {
-			brelse (bh);
-			return -ENOENT;
-		}
-		if (ufs_match(sb, namelen, name, de)) {
-			brelse (bh);
-			return -EEXIST;
-		}
-		if (de->d_ino == 0 && fs16_to_cpu(sb, de->d_reclen) >= rec_len)
-			break;
-			
-		if (fs16_to_cpu(sb, de->d_reclen) >=
-		     UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)) + rec_len)
-			break;
-		offset += fs16_to_cpu(sb, de->d_reclen);
-		de = (struct ufs_dir_entry *) ((char *) de + fs16_to_cpu(sb, de->d_reclen));
+		unlock_page(page);
+		ufs_put_page(page);
 	}
-
+	BUG();
+	return -EINVAL;
+
+got_it:
+	from = (char*)de - (char*)page_address(page);
+	to = from + rec_len;
+	err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
+	if (err)
+		goto out_unlock;
 	if (de->d_ino) {
-		de1 = (struct ufs_dir_entry *) ((char *) de +
-			UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
-		de1->d_reclen =
-			cpu_to_fs16(sb, fs16_to_cpu(sb, de->d_reclen) -
-				UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
-		de->d_reclen =
-			cpu_to_fs16(sb, UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
+		struct ufs_dir_entry *de1 =
+			(struct ufs_dir_entry *) ((char *) de + name_len);
+		de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len);
+		de->d_reclen = cpu_to_fs16(sb, name_len);
+
 		de = de1;
 	}
-	de->d_ino = 0;
+
 	ufs_set_de_namlen(sb, de, namelen);
-	memcpy (de->d_name, name, namelen + 1);
+	memcpy(de->d_name, name, namelen + 1);
 	de->d_ino = cpu_to_fs32(sb, inode->i_ino);
 	ufs_set_de_type(sb, de, inode->i_mode);
-	mark_buffer_dirty(bh);
-	if (IS_DIRSYNC(dir))
-		sync_dirty_buffer(bh);
-	brelse (bh);
+
+	err = ufs_commit_chunk(page, from, to);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-	dir->i_version++;
+
 	mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	ufs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
 
-	UFSD(("EXIT\n"))
+static inline unsigned
+ufs_validate_entry(struct super_block *sb, char *base,
+		   unsigned offset, unsigned mask)
+{
+	struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset);
+	struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask));
+	while ((char*)p < (char*)de) {
+		if (p->d_reclen == 0)
+			break;
+		p = ufs_next_entry(sb, p);
+	}
+	return (char *)p - base;
+}
+
+
+/*
+ * This is blatantly stolen from ext2fs
+ */
+static int
+ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	loff_t pos = filp->f_pos;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = ufs_dir_pages(inode);
+	unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1);
+	int need_revalidate = filp->f_version != inode->i_version;
+	unsigned flags = UFS_SB(sb)->s_flags;
+
+	UFSD("BEGIN\n");
+
+	if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
+		return 0;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct ufs_dir_entry *de;
+
+		struct page *page = ufs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			ufs_error(sb, __FUNCTION__,
+				  "bad page in #%lu",
+				  inode->i_ino);
+			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
+				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			filp->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (struct ufs_dir_entry *)(kaddr+offset);
+		limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
+		for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
+			if (de->d_reclen == 0) {
+				ufs_error(sb, __FUNCTION__,
+					"zero-length directory entry");
+				ufs_put_page(page);
+				return -EIO;
+			}
+			if (de->d_ino) {
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				offset = (char *)de - kaddr;
+
+				UFSD("filldir(%s,%u)\n", de->d_name,
+				      fs32_to_cpu(sb, de->d_ino));
+				UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
+
+				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
+					d_type = de->d_u.d_44.d_type;
+
+				over = filldir(dirent, de->d_name,
+					       ufs_get_de_namlen(sb, de),
+						(n<<PAGE_CACHE_SHIFT) | offset,
+					       fs32_to_cpu(sb, de->d_ino), d_type);
+				if (over) {
+					ufs_put_page(page);
+					return 0;
+				}
+			}
+			filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
+		}
+		ufs_put_page(page);
+	}
 	return 0;
 }
 
+
 /*
  * ufs_delete_entry deletes a directory entry by merging it with the
  * previous entry.
  */
-int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir,
-	struct buffer_head * bh )
-	
+int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
+		     struct page * page)
 {
-	struct super_block * sb;
-	struct ufs_dir_entry * de, * pde;
-	unsigned i;
-	
-	UFSD(("ENTER\n"))
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = page->mapping;
+	char *kaddr = page_address(page);
+	unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1);
+	unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
+	struct ufs_dir_entry *pde = NULL;
+	struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
+	int err;
 
-	sb = inode->i_sb;
-	i = 0;
-	pde = NULL;
-	de = (struct ufs_dir_entry *) bh->b_data;
-	
-	UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
-		fs32_to_cpu(sb, de->d_ino),
-		fs16_to_cpu(sb, de->d_reclen),
-		ufs_get_de_namlen(sb, de), de->d_name))
-
-	while (i < bh->b_size) {
-		if (!ufs_check_dir_entry ("ufs_delete_entry", inode, de, bh, i)) {
-			brelse(bh);
-			return -EIO;
-		}
-		if (de == dir)  {
-			if (pde)
-				fs16_add(sb, &pde->d_reclen,
-					fs16_to_cpu(sb, dir->d_reclen));
-			dir->d_ino = 0;
-			inode->i_version++;
-			inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
-			mark_inode_dirty(inode);
-			mark_buffer_dirty(bh);
-			if (IS_DIRSYNC(inode))
-				sync_dirty_buffer(bh);
-			brelse(bh);
-			UFSD(("EXIT\n"))
-			return 0;
+	UFSD("ENTER\n");
+
+	UFSD("ino %u, reclen %u, namlen %u, name %s\n",
+	      fs32_to_cpu(sb, de->d_ino),
+	      fs16_to_cpu(sb, de->d_reclen),
+	      ufs_get_de_namlen(sb, de), de->d_name);
+
+	while ((char*)de < (char*)dir) {
+		if (de->d_reclen == 0) {
+			ufs_error(inode->i_sb, __FUNCTION__,
+				  "zero-length directory entry");
+			err = -EIO;
+			goto out;
 		}
-		i += fs16_to_cpu(sb, de->d_reclen);
-		if (i == UFS_SECTOR_SIZE) pde = NULL;
-		else pde = de;
-		de = (struct ufs_dir_entry *)
-		    ((char *) de + fs16_to_cpu(sb, de->d_reclen));
-		if (i == UFS_SECTOR_SIZE && de->d_reclen == 0)
-			break;
+		pde = de;
+		de = ufs_next_entry(sb, de);
 	}
-	UFSD(("EXIT\n"))
-	brelse(bh);
-	return -ENOENT;
+	if (pde)
+		from = (char*)pde - (char*)page_address(page);
+	lock_page(page);
+	err = mapping->a_ops->prepare_write(NULL, page, from, to);
+	BUG_ON(err);
+	if (pde)
+		pde->d_reclen = cpu_to_fs16(sb, to-from);
+	dir->d_ino = 0;
+	err = ufs_commit_chunk(page, from, to);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+out:
+	ufs_put_page(page);
+	UFSD("EXIT\n");
+	return err;
 }
 
 int ufs_make_empty(struct inode * inode, struct inode *dir)
 {
 	struct super_block * sb = dir->i_sb;
-	struct buffer_head * dir_block;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
 	struct ufs_dir_entry * de;
+	char *base;
 	int err;
 
-	dir_block = ufs_bread (inode, 0, 1, &err);
-	if (!dir_block)
-		return err;
+	if (!page)
+		return -ENOMEM;
+	kmap(page);
+	err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+
+	base = (char*)page_address(page);
+	memset(base, 0, PAGE_CACHE_SIZE);
+
+	de = (struct ufs_dir_entry *) base;
 
-	inode->i_blocks = sb->s_blocksize / UFS_SECTOR_SIZE;
-	de = (struct ufs_dir_entry *) dir_block->b_data;
 	de->d_ino = cpu_to_fs32(sb, inode->i_ino);
 	ufs_set_de_type(sb, de, inode->i_mode);
 	ufs_set_de_namlen(sb, de, 1);
@@ -552,72 +587,65 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
 	de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1));
 	ufs_set_de_namlen(sb, de, 2);
 	strcpy (de->d_name, "..");
-	mark_buffer_dirty(dir_block);
-	brelse (dir_block);
-	mark_inode_dirty(inode);
-	return 0;
+
+	err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE);
+fail:
+	kunmap(page);
+	page_cache_release(page);
+	return err;
 }
 
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-int ufs_empty_dir (struct inode * inode)
+int ufs_empty_dir(struct inode * inode)
 {
-	struct super_block * sb;
-	unsigned long offset;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de, * de1;
-	int err;
-	
-	sb = inode->i_sb;
-
-	if (inode->i_size < UFS_DIR_REC_LEN(1) + UFS_DIR_REC_LEN(2) ||
-	    !(bh = ufs_bread (inode, 0, 0, &err))) {
-	    	ufs_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no data block",
-			      inode->i_ino);
-		return 1;
-	}
-	de = (struct ufs_dir_entry *) bh->b_data;
-	de1 = (struct ufs_dir_entry *)
-		((char *)de + fs16_to_cpu(sb, de->d_reclen));
-	if (fs32_to_cpu(sb, de->d_ino) != inode->i_ino || de1->d_ino == 0 ||
-	     strcmp (".", de->d_name) || strcmp ("..", de1->d_name)) {
-	    	ufs_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no `.' or `..'",
-			      inode->i_ino);
-		return 1;
-	}
-	offset = fs16_to_cpu(sb, de->d_reclen) + fs16_to_cpu(sb, de1->d_reclen);
-	de = (struct ufs_dir_entry *)
-		((char *)de1 + fs16_to_cpu(sb, de1->d_reclen));
-	while (offset < inode->i_size ) {
-		if (!bh || (void *) de >= (void *) (bh->b_data + sb->s_blocksize)) {
-			brelse (bh);
-			bh = ufs_bread (inode, offset >> sb->s_blocksize_bits, 1, &err);
-	 		if (!bh) {
-				ufs_error (sb, "empty_dir",
-					    "directory #%lu contains a hole at offset %lu",
-					    inode->i_ino, offset);
-				offset += sb->s_blocksize;
-				continue;
+	struct super_block *sb = inode->i_sb;
+	struct page *page = NULL;
+	unsigned long i, npages = ufs_dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct ufs_dir_entry *de;
+		page = ufs_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct ufs_dir_entry *)kaddr;
+		kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->d_reclen == 0) {
+				ufs_error(inode->i_sb, __FUNCTION__,
+					"zero-length directory entry: "
+					"kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
 			}
-			de = (struct ufs_dir_entry *) bh->b_data;
-		}
-		if (!ufs_check_dir_entry ("empty_dir", inode, de, bh, offset)) {
-			brelse (bh);
-			return 1;
-		}
-		if (de->d_ino) {
-			brelse (bh);
-			return 0;
+			if (de->d_ino) {
+				u16 namelen=ufs_get_de_namlen(sb, de);
+				/* check for . and .. */
+				if (de->d_name[0] != '.')
+					goto not_empty;
+				if (namelen > 2)
+					goto not_empty;
+				if (namelen < 2) {
+					if (inode->i_ino !=
+					    fs32_to_cpu(sb, de->d_ino))
+						goto not_empty;
+				} else if (de->d_name[1] != '.')
+					goto not_empty;
+			}
+			de = ufs_next_entry(sb, de);
 		}
-		offset += fs16_to_cpu(sb, de->d_reclen);
-		de = (struct ufs_dir_entry *)
-			((char *)de + fs16_to_cpu(sb, de->d_reclen));
+		ufs_put_page(page);
 	}
-	brelse (bh);
 	return 1;
+
+not_empty:
+	ufs_put_page(page);
+	return 0;
 }
 
 const struct file_operations ufs_dir_operations = {
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 312fd3f86313..0e5001512a9d 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -25,6 +25,26 @@
 
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
+#include <linux/buffer_head.h>	/* for sync_mapping_buffers() */
+
+static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+	int ret;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = ufs_sync_inode(inode);
+	if (ret == 0)
+		ret = err;
+	return ret;
+}
+
 
 /*
  * We have mostly NULL's here: the current defaults are ok for
@@ -37,6 +57,7 @@ const struct file_operations ufs_file_operations = {
 	.write		= generic_file_write,
 	.mmap		= generic_file_mmap,
 	.open           = generic_file_open,
+	.fsync		= ufs_sync_file,
 	.sendfile	= generic_file_sendfile,
 };
 
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index c7a47ed4f430..9501dcd3b213 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -34,14 +34,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_IALLOC_DEBUG
-
-#ifdef UFS_IALLOC_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 /*
  * NOTE! When we get the inode, we're the only people
  * that have access to it, and as such there are no
@@ -68,7 +60,7 @@ void ufs_free_inode (struct inode * inode)
 	int is_directory;
 	unsigned ino, cg, bit;
 	
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -91,7 +83,7 @@ void ufs_free_inode (struct inode * inode)
 		unlock_super (sb);
 		return;
 	}
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg))
 		ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
 
@@ -104,33 +96,33 @@ void ufs_free_inode (struct inode * inode)
 
 	clear_inode (inode);
 
-	if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit))
+	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
 		ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
 	else {
-		ubh_clrbit (UCPI_UBH, ucpi->c_iusedoff, bit);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
 		if (ino < ucpi->c_irotor)
 			ucpi->c_irotor = ino;
 		fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nifree, 1);
+		uspi->cs_total.cs_nifree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1);
 
 		if (is_directory) {
 			fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1);
-			fs32_sub(sb, &usb1->fs_cstotal.cs_ndir, 1);
+			uspi->cs_total.cs_ndir--;
 			fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1);
 		}
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	
 	sb->s_dirt = 1;
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 /*
@@ -155,7 +147,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode)
 	unsigned cg, bit, i, j, start;
 	struct ufs_inode_info *ufsi;
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
@@ -213,43 +205,43 @@ cg_found:
 	ucpi = ufs_load_cylinder (sb, cg);
 	if (!ucpi)
 		goto failed;
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) 
 		ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number");
 
 	start = ucpi->c_irotor;
-	bit = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_iusedoff, uspi->s_ipg, start);
+	bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start);
 	if (!(bit < uspi->s_ipg)) {
-		bit = ubh_find_first_zero_bit (UCPI_UBH, ucpi->c_iusedoff, start);
+		bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start);
 		if (!(bit < start)) {
 			ufs_error (sb, "ufs_new_inode",
 			    "cylinder group %u corrupted - error in inode bitmap\n", cg);
 			goto failed;
 		}
 	}
-	UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg))
-	if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit))
-		ubh_setbit (UCPI_UBH, ucpi->c_iusedoff, bit);
+	UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg);
+	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
+		ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
 	else {
 		ufs_panic (sb, "ufs_new_inode", "internal error");
 		goto failed;
 	}
 	
 	fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nifree, 1);
+	uspi->cs_total.cs_nifree--;
 	fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1);
 	
 	if (S_ISDIR(mode)) {
 		fs32_add(sb, &ucg->cg_cs.cs_ndir, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_ndir, 1);
+		uspi->cs_total.cs_ndir++;
 		fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
@@ -272,6 +264,7 @@ cg_found:
 	ufsi->i_shadow = 0;
 	ufsi->i_osync = 0;
 	ufsi->i_oeftflag = 0;
+	ufsi->i_dir_start_lookup = 0;
 	memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
 
 	insert_inode_hash(inode);
@@ -287,14 +280,14 @@ cg_found:
 		return ERR_PTR(-EDQUOT);
 	}
 
-	UFSD(("allocating inode %lu\n", inode->i_ino))
-	UFSD(("EXIT\n"))
+	UFSD("allocating inode %lu\n", inode->i_ino);
+	UFSD("EXIT\n");
 	return inode;
 
 failed:
 	unlock_super (sb);
 	make_bad_inode(inode);
 	iput (inode);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return ERR_PTR(-ENOSPC);
 }
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3c3f62ce2ad9..f2dbdf5a8769 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,14 +41,7 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_INODE_DEBUG
-#undef UFS_INODE_DEBUG_MORE
-
-#ifdef UFS_INODE_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
+static u64 ufs_frag_map(struct inode *inode, sector_t frag);
 
 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
 {
@@ -61,7 +54,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
 	int n = 0;
 
 
-	UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks));
+	UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
 	if (i_block < 0) {
 		ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
 	} else if (i_block < direct_blocks) {
@@ -89,7 +82,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
  * the begining of the filesystem.
  */
 
-u64  ufs_frag_map(struct inode *inode, sector_t frag)
+static u64 ufs_frag_map(struct inode *inode, sector_t frag)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -104,8 +97,8 @@ u64  ufs_frag_map(struct inode *inode, sector_t frag)
 	unsigned flags = UFS_SB(sb)->s_flags;
 	u64 temp = 0L;
 
-	UFSD((": frag = %llu  depth = %d\n", (unsigned long long)frag, depth));
-	UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask));
+	UFSD(": frag = %llu  depth = %d\n", (unsigned long long)frag, depth);
+	UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask);
 
 	if (depth == 0)
 		return 0;
@@ -161,26 +154,64 @@ out:
 	return ret;
 }
 
-static struct buffer_head * ufs_inode_getfrag (struct inode *inode,
-	unsigned int fragment, unsigned int new_fragment,
-	unsigned int required, int *err, int metadata, long *phys, int *new)
+static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh)
+{
+	lock_buffer(bh);
+	memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	unlock_buffer(bh);
+	if (IS_SYNC(inode))
+		sync_dirty_buffer(bh);
+}
+
+static struct buffer_head *
+ufs_clear_frags(struct inode *inode, sector_t beg,
+		unsigned int n)
+{
+	struct buffer_head *res, *bh;
+	sector_t end = beg + n;
+
+	res = sb_getblk(inode->i_sb, beg);
+	ufs_clear_frag(inode, res);
+	for (++beg; beg < end; ++beg) {
+		bh = sb_getblk(inode->i_sb, beg);
+		ufs_clear_frag(inode, bh);
+		brelse(bh);
+	}
+	return res;
+}
+
+/**
+ * ufs_inode_getfrag() - allocate new fragment(s)
+ * @inode: pointer to inode
+ * @fragment: number of the fragment which holds the pointer
+ *   to the newly allocated fragment(s)
+ * @new_fragment: number of the newly allocated fragment(s)
+ * @required: how many fragment(s) we require
+ * @err: set if something goes wrong
+ * @phys: pointer to where we save the physical number of the newly allocated
+ *   fragments; NULL if we allocate metadata (indirect blocks for example)
+ * @new: set if we allocate a new block
+ * @locked_page: for ufs_new_fragments()
+ */
+static struct buffer_head *
+ufs_inode_getfrag(struct inode *inode, unsigned int fragment,
+		  sector_t new_fragment, unsigned int required, int *err,
+		  long *phys, int *new, struct page *locked_page)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
 	struct buffer_head * result;
 	unsigned block, blockoff, lastfrag, lastblock, lastblockoff;
 	unsigned tmp, goal;
 	__fs32 * p, * p2;
-	unsigned flags = 0;
 
-	UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n",
-		inode->i_ino, fragment, new_fragment, required))         
+	UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, required %u, "
+	     "metadata %d\n", inode->i_ino, fragment,
+	     (unsigned long long)new_fragment, required, !phys);
 
-	sb = inode->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
-
-	flags = UFS_SB(sb)->s_flags;
         /* TODO : to be done for write support
         if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
              goto ufs2;
@@ -195,16 +226,16 @@ repeat:
 	tmp = fs32_to_cpu(sb, *p);
 	lastfrag = ufsi->i_lastfrag;
 	if (tmp && fragment < lastfrag) {
-		if (metadata) {
+		if (!phys) {
 			result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
 			if (tmp == fs32_to_cpu(sb, *p)) {
-				UFSD(("EXIT, result %u\n", tmp + blockoff))
+				UFSD("EXIT, result %u\n", tmp + blockoff);
 				return result;
 			}
 			brelse (result);
 			goto repeat;
 		} else {
-			*phys = tmp;
+			*phys = tmp + blockoff;
 			return NULL;
 		}
 	}
@@ -221,7 +252,8 @@ repeat:
 		if (lastblockoff) {
 			p2 = ufsi->i_u1.i_data + lastblock;
 			tmp = ufs_new_fragments (inode, p2, lastfrag, 
-				fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, err);
+						 fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff,
+						 err, locked_page);
 			if (!tmp) {
 				if (lastfrag != ufsi->i_lastfrag)
 					goto repeat;
@@ -233,14 +265,16 @@ repeat:
 		}
 		goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb;
 		tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
-			goal, required + blockoff, err);
+					 goal, required + blockoff,
+					 err, locked_page);
 	}
 	/*
 	 * We will extend last allocated block
 	 */
 	else if (lastblock == block) {
-		tmp = ufs_new_fragments (inode, p, fragment - (blockoff - lastblockoff),
-			fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff), err);
+		tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff),
+					fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff),
+					err, locked_page);
 	}
 	/*
 	 * We will allocate new block before last allocated block
@@ -248,8 +282,8 @@ repeat:
 	else /* (lastblock > block) */ {
 		if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1])))
 			goal = tmp + uspi->s_fpb;
-		tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
-			goal, uspi->s_fpb, err);
+		tmp = ufs_new_fragments(inode, p, fragment - blockoff,
+					goal, uspi->s_fpb, err, locked_page);
 	}
 	if (!tmp) {
 		if ((!blockoff && *p) || 
@@ -259,14 +293,10 @@ repeat:
 		return NULL;
 	}
 
-	/* The nullification of framgents done in ufs/balloc.c is
-	 * something I don't have the stomache to move into here right
-	 * now. -DaveM
-	 */
-	if (metadata) {
-		result = sb_getblk(inode->i_sb, tmp + blockoff);
+	if (!phys) {
+		result = ufs_clear_frags(inode, tmp + blockoff, required);
 	} else {
-		*phys = tmp;
+		*phys = tmp + blockoff;
 		result = NULL;
 		*err = 0;
 		*new = 1;
@@ -276,7 +306,7 @@ repeat:
 	if (IS_SYNC(inode))
 		ufs_sync_inode (inode);
 	mark_inode_dirty(inode);
-	UFSD(("EXIT, result %u\n", tmp + blockoff))
+	UFSD("EXIT, result %u\n", tmp + blockoff);
 	return result;
 
      /* This part : To be implemented ....
@@ -295,22 +325,35 @@ repeat2:
      */
 }
 
-static struct buffer_head * ufs_block_getfrag (struct inode *inode,
-	struct buffer_head *bh, unsigned int fragment, unsigned int new_fragment, 
-	unsigned int blocksize, int * err, int metadata, long *phys, int *new)
+/**
+ * ufs_inode_getblock() - allocate new block
+ * @inode - pointer to inode
+ * @bh - pointer to block which holds "pointer" to newly allocated block
+ * @fragment - number of `fragment' which holds pointer
+ *   to newly allocated block
+ * @new_fragment - number of newly allocated fragment
+ *  (block will hold this fragment and also uspi->s_fpb-1)
+ * @err - see ufs_inode_getfrag()
+ * @phys - see ufs_inode_getfrag()
+ * @new - see ufs_inode_getfrag()
+ * @locked_page - see ufs_inode_getfrag()
+ */
+static struct buffer_head *
+ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
+		  unsigned int fragment, sector_t new_fragment, int *err,
+		  long *phys, int *new, struct page *locked_page)
 {
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
 	struct buffer_head * result;
 	unsigned tmp, goal, block, blockoff;
 	__fs32 * p;
 
-	sb = inode->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
 	block = ufs_fragstoblks (fragment);
 	blockoff = ufs_fragnum (fragment);
 
-	UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment))	
+	UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, metadata %d\n",
+	     inode->i_ino, fragment, (unsigned long long)new_fragment, !phys);
 
 	result = NULL;
 	if (!bh)
@@ -326,14 +369,14 @@ static struct buffer_head * ufs_block_getfrag (struct inode *inode,
 repeat:
 	tmp = fs32_to_cpu(sb, *p);
 	if (tmp) {
-		if (metadata) {
+		if (!phys) {
 			result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
 			if (tmp == fs32_to_cpu(sb, *p))
 				goto out;
 			brelse (result);
 			goto repeat;
 		} else {
-			*phys = tmp;
+			*phys = tmp + blockoff;
 			goto out;
 		}
 	}
@@ -342,21 +385,19 @@ repeat:
 		goal = tmp + uspi->s_fpb;
 	else
 		goal = bh->b_blocknr + uspi->s_fpb;
-	tmp = ufs_new_fragments (inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err);
+	tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
+				uspi->s_fpb, err, locked_page);
 	if (!tmp) {
 		if (fs32_to_cpu(sb, *p))
 			goto repeat;
 		goto out;
 	}		
 
-	/* The nullification of framgents done in ufs/balloc.c is
-	 * something I don't have the stomache to move into here right
-	 * now. -DaveM
-	 */
-	if (metadata) {
-		result = sb_getblk(sb, tmp + blockoff);
+
+	if (!phys) {
+		result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb);
 	} else {
-		*phys = tmp;
+		*phys = tmp + blockoff;
 		*new = 1;
 	}
 
@@ -365,18 +406,19 @@ repeat:
 		sync_dirty_buffer(bh);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	UFSD(("result %u\n", tmp + blockoff));
+	UFSD("result %u\n", tmp + blockoff);
 out:
 	brelse (bh);
-	UFSD(("EXIT\n"));
+	UFSD("EXIT\n");
 	return result;
 }
 
-/*
- * This function gets the block which contains the fragment.
+/**
+ * ufs_getfrag_block() - `get_block_t' function, interface between UFS and
+ * readpage, writepage and so on
  */
 
-int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
+int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
 {
 	struct super_block * sb = inode->i_sb;
 	struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
@@ -387,7 +429,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 	
 	if (!create) {
 		phys64 = ufs_frag_map(inode, fragment);
-		UFSD(("phys64 = %llu \n",phys64));
+		UFSD("phys64 = %llu \n",phys64);
 		if (phys64)
 			map_bh(bh_result, sb, phys64);
 		return 0;
@@ -402,7 +444,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 
 	lock_kernel();
 
-	UFSD(("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment))
+	UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
 	if (fragment < 0)
 		goto abort_negative;
 	if (fragment >
@@ -418,15 +460,15 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 	 * it much more readable:
 	 */
 #define GET_INODE_DATABLOCK(x) \
-		ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new)
+	ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new, bh_result->b_page)
 #define GET_INODE_PTR(x) \
-		ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL)
+	ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page)
 #define GET_INDIRECT_DATABLOCK(x) \
-		ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \
-				  &err, 0, &phys, &new);
+	ufs_inode_getblock(inode, bh, x, fragment,	\
+			  &err, &phys, &new, bh_result->b_page);
 #define GET_INDIRECT_PTR(x) \
-		ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \
-				  &err, 1, NULL, NULL);
+	ufs_inode_getblock(inode, bh, x, fragment,	\
+			  &err, NULL, NULL, bh_result->b_page);
 
 	if (ptr < UFS_NDIR_FRAGMENT) {
 		bh = GET_INODE_DATABLOCK(ptr);
@@ -474,8 +516,9 @@ abort_too_big:
 	goto abort;
 }
 
-struct buffer_head *ufs_getfrag(struct inode *inode, unsigned int fragment,
-				int create, int *err)
+static struct buffer_head *ufs_getfrag(struct inode *inode,
+				       unsigned int fragment,
+				       int create, int *err)
 {
 	struct buffer_head dummy;
 	int error;
@@ -502,7 +545,7 @@ struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
 {
 	struct buffer_head * bh;
 
-	UFSD(("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment))
+	UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
 	bh = ufs_getfrag (inode, fragment, create, err);
 	if (!bh || buffer_uptodate(bh)) 		
 		return bh;
@@ -540,6 +583,28 @@ struct address_space_operations ufs_aops = {
 	.bmap = ufs_bmap
 };
 
+static void ufs_set_inode_ops(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &ufs_file_inode_operations;
+		inode->i_fop = &ufs_file_operations;
+		inode->i_mapping->a_ops = &ufs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &ufs_dir_inode_operations;
+		inode->i_fop = &ufs_dir_operations;
+		inode->i_mapping->a_ops = &ufs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (!inode->i_blocks)
+			inode->i_op = &ufs_fast_symlink_inode_operations;
+		else {
+			inode->i_op = &page_symlink_inode_operations;
+			inode->i_mapping->a_ops = &ufs_aops;
+		}
+	} else
+		init_special_inode(inode, inode->i_mode,
+				   ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
+}
+
 void ufs_read_inode (struct inode * inode)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
@@ -552,7 +617,7 @@ void ufs_read_inode (struct inode * inode)
 	unsigned i;
 	unsigned flags;
 	
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -603,38 +668,22 @@ void ufs_read_inode (struct inode * inode)
 	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
 	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
 	ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
+	ufsi->i_dir_start_lookup = 0;
 	
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
 			ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i];
-	}
-	else {
+	} else {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
 			ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
 	}
 	ufsi->i_osync = 0;
 
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &ufs_file_inode_operations;
-		inode->i_fop = &ufs_file_operations;
-		inode->i_mapping->a_ops = &ufs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &ufs_dir_inode_operations;
-		inode->i_fop = &ufs_dir_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		if (!inode->i_blocks)
-			inode->i_op = &ufs_fast_symlink_inode_operations;
-		else {
-			inode->i_op = &page_symlink_inode_operations;
-			inode->i_mapping->a_ops = &ufs_aops;
-		}
-	} else
-		init_special_inode(inode, inode->i_mode,
-			ufs_get_inode_dev(sb, ufsi));
+	ufs_set_inode_ops(inode);
 
 	brelse (bh);
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
 bad_inode:
@@ -642,7 +691,7 @@ bad_inode:
 	return;
 
 ufs2_inode :
-	UFSD(("Reading ufs2 inode, ino %lu\n", inode->i_ino))
+	UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
 
 	ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino));
 
@@ -690,27 +739,11 @@ ufs2_inode :
 	}
 	ufsi->i_osync = 0;
 
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &ufs_file_inode_operations;
-		inode->i_fop = &ufs_file_operations;
-		inode->i_mapping->a_ops = &ufs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &ufs_dir_inode_operations;
-		inode->i_fop = &ufs_dir_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		if (!inode->i_blocks)
-			inode->i_op = &ufs_fast_symlink_inode_operations;
-		else {
-			inode->i_op = &page_symlink_inode_operations;
-			inode->i_mapping->a_ops = &ufs_aops;
-		}
-	} else   /* TODO  : here ...*/
-		init_special_inode(inode, inode->i_mode,
-			ufs_get_inode_dev(sb, ufsi));
+	ufs_set_inode_ops(inode);
 
 	brelse(bh);
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 }
 
@@ -724,7 +757,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 	unsigned i;
 	unsigned flags;
 
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -785,7 +818,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 		sync_dirty_buffer(bh);
 	brelse (bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 0;
 }
 
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 8d5f98a01c74..abd5f23a426d 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -1,6 +1,9 @@
 /*
  * linux/fs/ufs/namei.c
  *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
+ *
  * Copyright (C) 1998
  * Daniel Pirkl <daniel.pirkl@email.cz>
  * Charles University, Faculty of Mathematics and Physics
@@ -28,21 +31,9 @@
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
 #include "swab.h"	/* will go away - see comment in mknod() */
 #include "util.h"
 
-/*
-#undef UFS_NAMEI_DEBUG
-*/
-#define UFS_NAMEI_DEBUG
-
-#ifdef UFS_NAMEI_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ufs_add_link(dentry, inode);
@@ -88,8 +79,13 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
 static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
 		struct nameidata *nd)
 {
-	struct inode * inode = ufs_new_inode(dir, mode);
-	int err = PTR_ERR(inode);
+	struct inode *inode;
+	int err;
+
+	UFSD("BEGIN\n");
+	inode = ufs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ufs_file_inode_operations;
 		inode->i_fop = &ufs_file_operations;
@@ -99,6 +95,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
 		err = ufs_add_nondir(dentry, inode);
 		unlock_kernel();
 	}
+	UFSD("END: err=%d\n", err);
 	return err;
 }
 
@@ -205,6 +202,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 
 	inode->i_op = &ufs_dir_inode_operations;
 	inode->i_fop = &ufs_dir_operations;
+	inode->i_mapping->a_ops = &ufs_aops;
 
 	inode_inc_link_count(inode);
 
@@ -231,19 +229,18 @@ out_dir:
 	goto out;
 }
 
-static int ufs_unlink(struct inode * dir, struct dentry *dentry)
+static int ufs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode * inode = dentry->d_inode;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de;
+	struct ufs_dir_entry *de;
+	struct page *page;
 	int err = -ENOENT;
 
-	lock_kernel();
-	de = ufs_find_entry (dentry, &bh);
+	de = ufs_find_entry(dir, dentry, &page);
 	if (!de)
 		goto out;
 
-	err = ufs_delete_entry (dir, de, bh);
+	err = ufs_delete_entry(dir, de, page);
 	if (err)
 		goto out;
 
@@ -251,7 +248,6 @@ static int ufs_unlink(struct inode * dir, struct dentry *dentry)
 	inode_dec_link_count(inode);
 	err = 0;
 out:
-	unlock_kernel();
 	return err;
 }
 
@@ -273,42 +269,42 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
-	struct inode * new_dir,	struct dentry * new_dentry )
+static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
-	struct buffer_head *dir_bh = NULL;
-	struct ufs_dir_entry *dir_de = NULL;
-	struct buffer_head *old_bh;
+	struct page *dir_page = NULL;
+	struct ufs_dir_entry * dir_de = NULL;
+	struct page *old_page;
 	struct ufs_dir_entry *old_de;
 	int err = -ENOENT;
 
-	lock_kernel();
-	old_de = ufs_find_entry (old_dentry, &old_bh);
+	old_de = ufs_find_entry(old_dir, old_dentry, &old_page);
 	if (!old_de)
 		goto out;
 
 	if (S_ISDIR(old_inode->i_mode)) {
 		err = -EIO;
-		dir_de = ufs_dotdot(old_inode, &dir_bh);
+		dir_de = ufs_dotdot(old_inode, &dir_page);
 		if (!dir_de)
 			goto out_old;
 	}
 
 	if (new_inode) {
-		struct buffer_head *new_bh;
+		struct page *new_page;
 		struct ufs_dir_entry *new_de;
 
 		err = -ENOTEMPTY;
-		if (dir_de && !ufs_empty_dir (new_inode))
+		if (dir_de && !ufs_empty_dir(new_inode))
 			goto out_dir;
+
 		err = -ENOENT;
-		new_de = ufs_find_entry (new_dentry, &new_bh);
+		new_de = ufs_find_entry(new_dir, new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
-		ufs_set_link(new_dir, new_de, new_bh, old_inode);
+		ufs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
 			new_inode->i_nlink--;
@@ -329,24 +325,32 @@ static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
 			inode_inc_link_count(new_dir);
 	}
 
-	ufs_delete_entry (old_dir, old_de, old_bh);
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+ 	 * rename.
+	 * inode_dec_link_count() will mark the inode dirty.
+	 */
+	old_inode->i_ctime = CURRENT_TIME_SEC;
 
+	ufs_delete_entry(old_dir, old_de, old_page);
 	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
-		ufs_set_link(old_inode, dir_de, dir_bh, new_dir);
+		ufs_set_link(old_inode, dir_de, dir_page, new_dir);
 		inode_dec_link_count(old_dir);
 	}
-	unlock_kernel();
 	return 0;
 
+
 out_dir:
-	if (dir_de)
-		brelse(dir_bh);
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
 out_old:
-	brelse (old_bh);
+	kunmap(old_page);
+	page_cache_release(old_page);
 out:
-	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index db98a4c71e63..74ef5e9bedff 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -90,95 +90,84 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_SUPER_DEBUG
-#undef UFS_SUPER_DEBUG_MORE
-
-
-#undef UFS_SUPER_DEBUG_MORE
-#ifdef UFS_SUPER_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-#ifdef UFS_SUPER_DEBUG_MORE
+#ifdef CONFIG_UFS_DEBUG
 /*
  * Print contents of ufs_super_block, useful for debugging
  */
-void ufs_print_super_stuff(struct super_block *sb,
-	struct ufs_super_block_first * usb1,
-	struct ufs_super_block_second * usb2, 
-	struct ufs_super_block_third * usb3)
+static void ufs_print_super_stuff(struct super_block *sb, unsigned flags,
+				  struct ufs_super_block_first *usb1,
+				  struct ufs_super_block_second *usb2,
+				  struct ufs_super_block_third *usb3)
 {
 	printk("ufs_print_super_stuff\n");
-	printk("size of usb:     %u\n", sizeof(struct ufs_super_block));
-	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
-	printk("  sblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
-	printk("  cblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
-	printk("  iblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
-	printk("  dblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
-	printk("  cgoffset:      %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset));
-	printk("  ~cgmask:       0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask));
-	printk("  size:          %u\n", fs32_to_cpu(sb, usb1->fs_size));
-	printk("  dsize:         %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
-	printk("  ncg:           %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
-	printk("  bsize:         %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
-	printk("  fsize:         %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
-	printk("  frag:          %u\n", fs32_to_cpu(sb, usb1->fs_frag));
-	printk("  fragshift:     %u\n", fs32_to_cpu(sb, usb1->fs_fragshift));
-	printk("  ~fmask:        %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
-	printk("  fshift:        %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
-	printk("  sbsize:        %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
-	printk("  spc:           %u\n", fs32_to_cpu(sb, usb1->fs_spc));
-	printk("  cpg:           %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
-	printk("  ipg:           %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
-	printk("  fpg:           %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
-	printk("  csaddr:        %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
-	printk("  cssize:        %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
-	printk("  cgsize:        %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
-	printk("  fstodb:        %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb));
-	printk("  contigsumsize: %d\n", fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize));
-	printk("  postblformat:  %u\n", fs32_to_cpu(sb, usb3->fs_postblformat));
-	printk("  nrpos:         %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
-	printk("  ndir           %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
-	printk("  nifree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
-	printk("  nbfree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
-	printk("  nffree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
-	printk("\n");
-}
-
-/*
- * Print contents of ufs2 ufs_super_block, useful for debugging
- */
-void ufs2_print_super_stuff(
-     struct super_block *sb,
-      struct ufs_super_block *usb)
-{
-	printk("ufs_print_super_stuff\n");
-	printk("size of usb:     %u\n", sizeof(struct ufs_super_block));
-	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb->fs_magic));
-	printk("  fs_size:   %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
-	printk("  fs_dsize:  %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize));
-	printk("  bsize:         %u\n", fs32_to_cpu(usb, usb->fs_bsize));
-	printk("  fsize:         %u\n", fs32_to_cpu(usb, usb->fs_fsize));
-	printk("  fs_volname:  %s\n", usb->fs_u11.fs_u2.fs_volname);
-	printk("  fs_fsmnt:  %s\n", usb->fs_u11.fs_u2.fs_fsmnt);
-	printk("  fs_sblockloc: %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_sblockloc));
-	printk("  cs_ndir(No of dirs):  %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_cstotal.cs_ndir));
-	printk("  cs_nbfree(No of free blocks):  %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
+	printk("  magic:     0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+		printk("  fs_size:   %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
+		printk("  fs_dsize:  %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
+		printk("  bsize:         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_bsize));
+		printk("  fsize:         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fsize));
+		printk("  fs_volname:  %s\n", usb2->fs_un.fs_u2.fs_volname);
+		printk("  fs_sblockloc: %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
+		printk("  cs_ndir(No of dirs):  %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
+		printk("  cs_nbfree(No of free blocks):  %llu\n",
+		       (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
+	} else {
+		printk(" sblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
+		printk(" cblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
+		printk(" iblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
+		printk(" dblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
+		printk(" cgoffset:    %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cgoffset));
+		printk(" ~cgmask:     0x%x\n",
+		       ~fs32_to_cpu(sb, usb1->fs_cgmask));
+		printk(" size:        %u\n", fs32_to_cpu(sb, usb1->fs_size));
+		printk(" dsize:       %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
+		printk(" ncg:         %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
+		printk(" bsize:       %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
+		printk(" fsize:       %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
+		printk(" frag:        %u\n", fs32_to_cpu(sb, usb1->fs_frag));
+		printk(" fragshift:   %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fragshift));
+		printk(" ~fmask:      %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
+		printk(" fshift:      %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
+		printk(" sbsize:      %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
+		printk(" spc:         %u\n", fs32_to_cpu(sb, usb1->fs_spc));
+		printk(" cpg:         %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
+		printk(" ipg:         %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
+		printk(" fpg:         %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
+		printk(" csaddr:      %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
+		printk(" cssize:      %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
+		printk(" cgsize:      %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
+		printk(" fstodb:      %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fsbtodb));
+		printk(" nrpos:       %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
+		printk(" ndir         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
+		printk(" nifree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
+		printk(" nbfree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
+		printk(" nffree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
+	}
 	printk("\n");
 }
 
 /*
  * Print contents of ufs_cylinder_group, useful for debugging
  */
-void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg)
+static void ufs_print_cylinder_stuff(struct super_block *sb,
+				     struct ufs_cylinder_group *cg)
 {
 	printk("\nufs_print_cylinder_stuff\n");
-	printk("size of ucg: %u\n", sizeof(struct ufs_cylinder_group));
+	printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
 	printk("  magic:        %x\n", fs32_to_cpu(sb, cg->cg_magic));
 	printk("  time:         %u\n", fs32_to_cpu(sb, cg->cg_time));
 	printk("  cgx:          %u\n", fs32_to_cpu(sb, cg->cg_cgx));
@@ -202,12 +191,18 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group
 	printk("  iuseoff:      %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
 	printk("  freeoff:      %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
 	printk("  nextfreeoff:  %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
-	printk("  clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
-	printk("  clusteroff    %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
-	printk("  nclusterblks  %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
+	printk("  clustersumoff %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
+	printk("  clusteroff    %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
+	printk("  nclusterblks  %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
 	printk("\n");
 }
-#endif /* UFS_SUPER_DEBUG_MORE */
+#else
+#  define ufs_print_super_stuff(sb, flags, usb1, usb2, usb3) /**/
+#  define ufs_print_cylinder_stuff(sb, cg) /**/
+#endif /* CONFIG_UFS_DEBUG */
 
 static struct super_operations ufs_super_ops;
 
@@ -225,7 +220,7 @@ void ufs_error (struct super_block * sb, const char * function,
 	
 	if (!(sb->s_flags & MS_RDONLY)) {
 		usb1->fs_clean = UFS_FSBAD;
-		ubh_mark_buffer_dirty(USPI_UBH);
+		ubh_mark_buffer_dirty(USPI_UBH(uspi));
 		sb->s_dirt = 1;
 		sb->s_flags |= MS_RDONLY;
 	}
@@ -257,7 +252,7 @@ void ufs_panic (struct super_block * sb, const char * function,
 	
 	if (!(sb->s_flags & MS_RDONLY)) {
 		usb1->fs_clean = UFS_FSBAD;
-		ubh_mark_buffer_dirty(USPI_UBH);
+		ubh_mark_buffer_dirty(USPI_UBH(uspi));
 		sb->s_dirt = 1;
 	}
 	va_start (args, fmt);
@@ -309,7 +304,7 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 {
 	char * p;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	if (!options)
 		return 1;
@@ -386,27 +381,57 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 }
 
 /*
+ * Different types of UFS hold fs_cstotal in different
+ * places, and use different data structures for it.
+ * To make things simpler we just copy fs_cstotal to ufs_sb_private_info
+ */
+static void ufs_setup_cstotal(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
+	unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
+
+	UFSD("ENTER, mtype=%u\n", mtype);
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+	     (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+	    mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+		/* we have statistics in a different place than usual */
+		uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
+		uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree);
+		uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
+		uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
+	} else {
+		uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir);
+		uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree);
+		uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
+		uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
+	}
+	UFSD("EXIT\n");
+}
+
+/*
  * Read on-disk structures associated with cylinder groups
  */
-static int ufs_read_cylinder_structures (struct super_block *sb)
+static int ufs_read_cylinder_structures(struct super_block *sb)
 {
-	struct ufs_sb_info * sbi = UFS_SB(sb);
-	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block *usb;
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	unsigned flags = sbi->s_flags;
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned size, blks, i;
-	unsigned flags = 0;
-	
-	UFSD(("ENTER\n"))
-	
-	uspi = sbi->s_uspi;
+	struct ufs_super_block_third *usb3;
 
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data;
+	UFSD("ENTER\n");
 
-        flags = UFS_SB(sb)->s_flags;
-	
+	usb3 = ubh_get_usb_third(uspi);
 	/*
 	 * Read cs structures from (usually) first data block
 	 * on the device. 
@@ -424,7 +449,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 
 		if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 
 			ubh = ubh_bread(sb,
-				fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size);
+				fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr) + i, size);
 		else 
 			ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
 		
@@ -451,14 +476,13 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
 	}
 	for (i = 0; i < uspi->s_ncg; i++) {
-		UFSD(("read cg %u\n", i))
+		UFSD("read cg %u\n", i);
 		if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i))))
 			goto failed;
 		if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
 			goto failed;
-#ifdef UFS_SUPER_DEBUG_MORE
+
 		ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
-#endif
 	}
 	for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
 		if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL)))
@@ -466,7 +490,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
 	}
 	sbi->s_cg_loaded = 0;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 1;
 
 failed:
@@ -479,26 +503,69 @@ failed:
 		for (i = 0; i < UFS_MAX_GROUP_LOADED; i++)
 			kfree (sbi->s_ucpi[i]);
 	}
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 }
 
 /*
- * Put on-disk structures associated with cylinder groups and 
- * write them back to disk
+ * Sync our internal copy of fs_cstotal with disk
  */
-static void ufs_put_cylinder_structures (struct super_block *sb)
+static void ufs_put_cstotal(struct super_block *sb)
 {
-	struct ufs_sb_info * sbi = UFS_SB(sb);
-	struct ufs_sb_private_info * uspi;
+	unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
+
+	UFSD("ENTER\n");
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+	     (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+	    mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+		/* statistics are stored in a different place than usual */
+		usb2->fs_un.fs_u2.cs_ndir =
+			cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
+		usb2->fs_un.fs_u2.cs_nbfree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
+		usb3->fs_un1.fs_u2.cs_nifree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
+		usb3->fs_un1.fs_u2.cs_nffree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
+	} else {
+		usb1->fs_cstotal.cs_ndir =
+			cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
+		usb1->fs_cstotal.cs_nbfree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
+		usb1->fs_cstotal.cs_nifree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
+		usb1->fs_cstotal.cs_nffree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+	}
+	ubh_mark_buffer_dirty(USPI_UBH(uspi));
+	UFSD("EXIT\n");
+}
+
+/**
+ * ufs_put_super_internal() - put on-disk internal structures
+ * @sb: pointer to super_block structure
+ * Put on-disk structures associated with cylinder groups
+ * and write them back to disk, also update cs_total on disk
+ */
+static void ufs_put_super_internal(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned blks, size, i;
-	
-	UFSD(("ENTER\n"))
-	
-	uspi = sbi->s_uspi;
 
+	
+	UFSD("ENTER\n");
+	ufs_put_cstotal(sb);
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
 	base = space = (char*) sbi->s_csp;
@@ -523,7 +590,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb)
 		brelse (sbi->s_ucg[i]);
 	kfree (sbi->s_ucg);
 	kfree (base);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 static int ufs_fill_super(struct super_block *sb, void *data, int silent)
@@ -533,7 +600,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block_second * usb2;
 	struct ufs_super_block_third * usb3;
-	struct ufs_super_block *usb;
 	struct ufs_buffer_head * ubh;	
 	struct inode *inode;
 	unsigned block_size, super_block_size;
@@ -544,7 +610,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	ubh = NULL;
 	flags = 0;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 		
 	sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
 	if (!sbi)
@@ -552,7 +618,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	memset(sbi, 0, sizeof(struct ufs_sb_info));
 
-	UFSD(("flag %u\n", (int)(sb->s_flags & MS_RDONLY)))
+	UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
 	
 #ifndef CONFIG_UFS_FS_WRITE
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -593,7 +659,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	   the rules */
 	switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
 	case UFS_MOUNT_UFSTYPE_44BSD:
-		UFSD(("ufstype=44bsd\n"))
+		UFSD("ufstype=44bsd\n");
 		uspi->s_fsize = block_size = 512;
 		uspi->s_fmask = ~(512 - 1);
 		uspi->s_fshift = 9;
@@ -602,7 +668,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
 		break;
 	case UFS_MOUNT_UFSTYPE_UFS2:
-		UFSD(("ufstype=ufs2\n"));
+		UFSD("ufstype=ufs2\n");
 		super_block_offset=SBLOCK_UFS2;
 		uspi->s_fsize = block_size = 512;
 		uspi->s_fmask = ~(512 - 1);
@@ -617,7 +683,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 		
 	case UFS_MOUNT_UFSTYPE_SUN:
-		UFSD(("ufstype=sun\n"))
+		UFSD("ufstype=sun\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -628,7 +694,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 
 	case UFS_MOUNT_UFSTYPE_SUNx86:
-		UFSD(("ufstype=sunx86\n"))
+		UFSD("ufstype=sunx86\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -639,7 +705,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 
 	case UFS_MOUNT_UFSTYPE_OLD:
-		UFSD(("ufstype=old\n"))
+		UFSD("ufstype=old\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -654,7 +720,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_NEXTSTEP:
-		UFSD(("ufstype=nextstep\n"))
+		UFSD("ufstype=nextstep\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -669,7 +735,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
-		UFSD(("ufstype=nextstep-cd\n"))
+		UFSD("ufstype=nextstep-cd\n");
 		uspi->s_fsize = block_size = 2048;
 		uspi->s_fmask = ~(2048 - 1);
 		uspi->s_fshift = 11;
@@ -684,7 +750,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_OPENSTEP:
-		UFSD(("ufstype=openstep\n"))
+		UFSD("ufstype=openstep\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -699,7 +765,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_HP:
-		UFSD(("ufstype=hp\n"))
+		UFSD("ufstype=hp\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -737,8 +803,6 @@ again:
 	usb1 = ubh_get_usb_first(uspi);
 	usb2 = ubh_get_usb_second(uspi);
 	usb3 = ubh_get_usb_third(uspi);
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
 
 	/*
 	 * Check ufs magic number
@@ -820,16 +884,12 @@ magic_found:
 		ubh = NULL;
 		block_size = uspi->s_fsize;
 		super_block_size = uspi->s_sbsize;
-		UFSD(("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size))
+		UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size);
 		goto again;
 	}
 
-#ifdef UFS_SUPER_DEBUG_MORE
-        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
-		ufs2_print_super_stuff(sb,usb);
-        else
-		ufs_print_super_stuff(sb, usb1, usb2, usb3);
-#endif
+
+	ufs_print_super_stuff(sb, flags, usb1, usb2, usb3);
 
 	/*
 	 * Check, if file system was correctly unmounted.
@@ -842,13 +902,13 @@ magic_found:
 	  (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
 		switch(usb1->fs_clean) {
 		case UFS_FSCLEAN:
-			UFSD(("fs is clean\n"))
+			UFSD("fs is clean\n");
 			break;
 		case UFS_FSSTABLE:
-			UFSD(("fs is stable\n"))
+			UFSD("fs is stable\n");
 			break;
 		case UFS_FSOSF1:
-			UFSD(("fs is DEC OSF/1\n"))
+			UFSD("fs is DEC OSF/1\n");
 			break;
 		case UFS_FSACTIVE:
 			printk("ufs_read_super: fs is active\n");
@@ -863,8 +923,7 @@ magic_found:
 			sb->s_flags |= MS_RDONLY;
 			break;
 		}
-	}
-	else {
+	} else {
 		printk("ufs_read_super: fs needs fsck\n");
 		sb->s_flags |= MS_RDONLY;
 	}
@@ -884,10 +943,9 @@ magic_found:
 	uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
 
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
-		uspi->s_u2_size  = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size);
-		uspi->s_u2_dsize = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize);
-	}
-	else {
+		uspi->s_u2_size  = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
+		uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+	} else {
 		uspi->s_size  =  fs32_to_cpu(sb, usb1->fs_size);
 		uspi->s_dsize =  fs32_to_cpu(sb, usb1->fs_dsize);
 	}
@@ -901,8 +959,8 @@ magic_found:
 	uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
 	uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift);
 	uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
-	UFSD(("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
-		uspi->s_fshift));
+	UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
+		uspi->s_fshift);
 	uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift);
 	uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb);
 	/* s_sbsize already set */
@@ -922,8 +980,8 @@ magic_found:
 	uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc);
 	uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg);
 	uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg);
-	uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_cpc);
-	uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize);
+	uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc);
+	uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize);
 	uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3);
 	uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3);
 	uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat);
@@ -935,12 +993,11 @@ magic_found:
 	 * Compute another frequently used values
 	 */
 	uspi->s_fpbmask = uspi->s_fpb - 1;
-	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
 		uspi->s_apbshift = uspi->s_bshift - 3;
-	}
-	else {
+	else
 		uspi->s_apbshift = uspi->s_bshift - 2;
-	}
+
 	uspi->s_2apbshift = uspi->s_apbshift * 2;
 	uspi->s_3apbshift = uspi->s_apbshift * 3;
 	uspi->s_apb = 1 << uspi->s_apbshift;
@@ -956,7 +1013,7 @@ magic_found:
 	if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) ==
 	    UFS_MOUNT_UFSTYPE_44BSD)
 		uspi->s_maxsymlinklen =
-		    fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_maxsymlinklen);
+		    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
 	
 	sbi->s_flags = flags;
 
@@ -967,7 +1024,7 @@ magic_found:
 	if (!sb->s_root)
 		goto dalloc_failed;
 
-
+	ufs_setup_cstotal(sb);
 	/*
 	 * Read cylinder group structures
 	 */
@@ -975,7 +1032,7 @@ magic_found:
 		if (!ufs_read_cylinder_structures(sb))
 			goto failed;
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 0;
 
 dalloc_failed:
@@ -986,15 +1043,16 @@ failed:
 	kfree (uspi);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return -EINVAL;
 
 failed_nomem:
-	UFSD(("EXIT (NOMEM)\n"))
+	UFSD("EXIT (NOMEM)\n");
 	return -ENOMEM;
 }
 
-static void ufs_write_super (struct super_block *sb) {
+static void ufs_write_super(struct super_block *sb)
+{
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block_third * usb3;
@@ -1002,7 +1060,7 @@ static void ufs_write_super (struct super_block *sb) {
 
 	lock_kernel();
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	flags = UFS_SB(sb)->s_flags;
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
@@ -1014,26 +1072,27 @@ static void ufs_write_super (struct super_block *sb) {
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
 			ufs_set_fs_state(sb, usb1, usb3,
 					UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ubh_mark_buffer_dirty (USPI_UBH);
+		ufs_put_cstotal(sb);
 	}
 	sb->s_dirt = 0;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	unlock_kernel();
 }
 
-static void ufs_put_super (struct super_block *sb)
+static void ufs_put_super(struct super_block *sb)
 {
 	struct ufs_sb_info * sbi = UFS_SB(sb);
 		
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	if (!(sb->s_flags & MS_RDONLY))
-		ufs_put_cylinder_structures (sb);
+		ufs_put_super_internal(sb);
 	
 	ubh_brelse_uspi (sbi->s_uspi);
 	kfree (sbi->s_uspi);
 	kfree (sbi);
 	sb->s_fs_info = NULL;
+	UFSD("EXIT\n");
 	return;
 }
 
@@ -1062,8 +1121,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		return -EINVAL;
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
 		new_mount_opt |= ufstype;
-	}
-	else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
+	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
 		return -EINVAL;
 	}
@@ -1077,20 +1135,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	 * fs was mouted as rw, remounting ro
 	 */
 	if (*mount_flags & MS_RDONLY) {
-		ufs_put_cylinder_structures(sb);
+		ufs_put_super_internal(sb);
 		usb1->fs_time = cpu_to_fs32(sb, get_seconds());
 		if ((flags & UFS_ST_MASK) == UFS_ST_SUN
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 
 			ufs_set_fs_state(sb, usb1, usb3,
 				UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ubh_mark_buffer_dirty (USPI_UBH);
+		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 		sb->s_dirt = 0;
 		sb->s_flags |= MS_RDONLY;
-	}
+	} else {
 	/*
 	 * fs was mounted as ro, remounting rw
 	 */
-	else {
 #ifndef CONFIG_UFS_FS_WRITE
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
@@ -1102,7 +1159,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 			printk("this ufstype is read-only supported\n");
 			return -EINVAL;
 		}
-		if (!ufs_read_cylinder_structures (sb)) {
+		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
 			return -EPERM;
 		}
@@ -1113,36 +1170,31 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	return 0;
 }
 
-static int ufs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
-	struct ufs_super_block * usb;
-	unsigned  flags = 0;
+	struct super_block *sb = dentry->d_sb;
+	struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
+	unsigned  flags = UFS_SB(sb)->s_flags;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
 
 	lock_kernel();
 
-	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first (uspi);
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
 	
-	flags = UFS_SB(sb)->s_flags;
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
 		buf->f_type = UFS2_MAGIC;
-		buf->f_blocks = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize);
-		buf->f_bfree = ufs_blkstofrags(fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree)) +
-			fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nffree);
-		buf->f_ffree = fs64_to_cpu(sb,
-        		usb->fs_u11.fs_u2.fs_cstotal.cs_nifree);
-	}
-	else {
+		buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+	} else {
 		buf->f_type = UFS_MAGIC;
 		buf->f_blocks = uspi->s_dsize;
-		buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) +
-			fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
-		buf->f_ffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
 	}
+	buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+		uspi->cs_total.cs_nffree;
+	buf->f_ffree = uspi->cs_total.cs_nifree;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
 		? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
@@ -1311,10 +1363,10 @@ out:
 
 #endif
 
-static struct super_block *ufs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ufs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
 }
 
 static struct file_system_type ufs_fs_type = {
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 02e86291ef8a..3c3b301f8701 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -49,14 +49,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_TRUNCATE_DEBUG
-
-#ifdef UFS_TRUNCATE_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
- 
 /*
  * Secure deletion currently doesn't work. It interacts very badly
  * with buffers shared with memory mappings, and for that reason
@@ -82,7 +74,7 @@ static int ufs_trunc_direct (struct inode * inode)
 	unsigned i, tmp;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -105,7 +97,7 @@ static int ufs_trunc_direct (struct inode * inode)
 		block2 = ufs_fragstoblks (frag3);
 	}
 
-	UFSD(("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4))
+	UFSD("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4);
 
 	if (frag1 >= frag2)
 		goto next1;		
@@ -120,9 +112,8 @@ static int ufs_trunc_direct (struct inode * inode)
 	frag1 = ufs_fragnum (frag1);
 	frag2 = ufs_fragnum (frag2);
 
-	inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift;
-	mark_inode_dirty(inode);
 	ufs_free_fragments (inode, tmp + frag1, frag2 - frag1);
+	mark_inode_dirty(inode);
 	frag_to_free = tmp + frag1;
 
 next1:
@@ -136,8 +127,7 @@ next1:
 			continue;
 
 		*p = 0;
-		inode->i_blocks -= uspi->s_nspb;
-		mark_inode_dirty(inode);
+
 		if (free_count == 0) {
 			frag_to_free = tmp;
 			free_count = uspi->s_fpb;
@@ -148,6 +138,7 @@ next1:
 			frag_to_free = tmp;
 			free_count = uspi->s_fpb;
 		}
+		mark_inode_dirty(inode);
 	}
 	
 	if (free_count > 0)
@@ -166,12 +157,12 @@ next1:
 	frag4 = ufs_fragnum (frag4);
 
 	*p = 0;
-	inode->i_blocks -= frag4 << uspi->s_nspfshift;
-	mark_inode_dirty(inode);
+
 	ufs_free_fragments (inode, tmp, frag4);
+	mark_inode_dirty(inode);
  next3:
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return retry;
 }
 
@@ -186,7 +177,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 	unsigned frag_to_free, free_count;
 	int retry;
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 		
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -227,7 +218,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 			frag_to_free = tmp;
 			free_count = uspi->s_fpb;
 		}
-		inode->i_blocks -= uspi->s_nspb;
+
 		mark_inode_dirty(inode);
 	}
 
@@ -238,26 +229,21 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 		if (*ubh_get_addr32(ind_ubh,i))
 			break;
 	if (i >= uspi->s_apb) {
-		if (ubh_max_bcount(ind_ubh) != 1) {
-			retry = 1;
-		}
-		else {
-			tmp = fs32_to_cpu(sb, *p);
-			*p = 0;
-			inode->i_blocks -= uspi->s_nspb;
-			mark_inode_dirty(inode);
-			ufs_free_blocks (inode, tmp, uspi->s_fpb);
-			ubh_bforget(ind_ubh);
-			ind_ubh = NULL;
-		}
+		tmp = fs32_to_cpu(sb, *p);
+		*p = 0;
+
+		ufs_free_blocks (inode, tmp, uspi->s_fpb);
+		mark_inode_dirty(inode);
+		ubh_bforget(ind_ubh);
+		ind_ubh = NULL;
 	}
 	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
-		ubh_ll_rw_block (SWRITE, 1, &ind_ubh);
+		ubh_ll_rw_block(SWRITE, ind_ubh);
 		ubh_wait_on_buffer (ind_ubh);
 	}
 	ubh_brelse (ind_ubh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	
 	return retry;
 }
@@ -271,7 +257,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 	__fs32 * dind;
 	int retry = 0;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -306,25 +292,21 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 		if (*ubh_get_addr32 (dind_bh, i))
 			break;
 	if (i >= uspi->s_apb) {
-		if (ubh_max_bcount(dind_bh) != 1)
-			retry = 1;
-		else {
-			tmp = fs32_to_cpu(sb, *p);
-			*p = 0;
-			inode->i_blocks -= uspi->s_nspb;
-			mark_inode_dirty(inode);
-			ufs_free_blocks (inode, tmp, uspi->s_fpb);
-			ubh_bforget(dind_bh);
-			dind_bh = NULL;
-		}
+		tmp = fs32_to_cpu(sb, *p);
+		*p = 0;
+
+		ufs_free_blocks(inode, tmp, uspi->s_fpb);
+		mark_inode_dirty(inode);
+		ubh_bforget(dind_bh);
+		dind_bh = NULL;
 	}
 	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
-		ubh_ll_rw_block (SWRITE, 1, &dind_bh);
+		ubh_ll_rw_block(SWRITE, dind_bh);
 		ubh_wait_on_buffer (dind_bh);
 	}
 	ubh_brelse (dind_bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	
 	return retry;
 }
@@ -339,7 +321,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
 	__fs32 * tind, * p;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -370,25 +352,21 @@ static int ufs_trunc_tindirect (struct inode * inode)
 		if (*ubh_get_addr32 (tind_bh, i))
 			break;
 	if (i >= uspi->s_apb) {
-		if (ubh_max_bcount(tind_bh) != 1)
-			retry = 1;
-		else {
-			tmp = fs32_to_cpu(sb, *p);
-			*p = 0;
-			inode->i_blocks -= uspi->s_nspb;
-			mark_inode_dirty(inode);
-			ufs_free_blocks (inode, tmp, uspi->s_fpb);
-			ubh_bforget(tind_bh);
-			tind_bh = NULL;
-		}
+		tmp = fs32_to_cpu(sb, *p);
+		*p = 0;
+
+		ufs_free_blocks(inode, tmp, uspi->s_fpb);
+		mark_inode_dirty(inode);
+		ubh_bforget(tind_bh);
+		tind_bh = NULL;
 	}
 	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
-		ubh_ll_rw_block (SWRITE, 1, &tind_bh);
+		ubh_ll_rw_block(SWRITE, tind_bh);
 		ubh_wait_on_buffer (tind_bh);
 	}
 	ubh_brelse (tind_bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return retry;
 }
 		
@@ -399,7 +377,7 @@ void ufs_truncate (struct inode * inode)
 	struct ufs_sb_private_info * uspi;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
 
@@ -430,5 +408,5 @@ void ufs_truncate (struct inode * inode)
 	ufsi->i_lastfrag = DIRECT_FRAGMENT;
 	unlock_kernel();
 	mark_inode_dirty(inode);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 59acc8f073ac..a2f13f45708b 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -14,15 +14,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_UTILS_DEBUG
-
-#ifdef UFS_UTILS_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-
 struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
 	struct super_block *sb, u64 fragment, u64 size)
 {
@@ -63,17 +54,17 @@ struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi,
 	count = size >> uspi->s_fshift;
 	if (count <= 0 || count > UFS_MAXFRAG)
 		return NULL;
-	USPI_UBH->fragment = fragment;
-	USPI_UBH->count = count;
+	USPI_UBH(uspi)->fragment = fragment;
+	USPI_UBH(uspi)->count = count;
 	for (i = 0; i < count; i++)
-		if (!(USPI_UBH->bh[i] = sb_bread(sb, fragment + i)))
+		if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i)))
 			goto failed;
 	for (; i < UFS_MAXFRAG; i++)
-		USPI_UBH->bh[i] = NULL;
-	return USPI_UBH;
+		USPI_UBH(uspi)->bh[i] = NULL;
+	return USPI_UBH(uspi);
 failed:
 	for (j = 0; j < i; j++)
-		brelse (USPI_UBH->bh[j]);
+		brelse (USPI_UBH(uspi)->bh[j]);
 	return NULL;
 }
 
@@ -90,11 +81,11 @@ void ubh_brelse (struct ufs_buffer_head * ubh)
 void ubh_brelse_uspi (struct ufs_sb_private_info * uspi)
 {
 	unsigned i;
-	if (!USPI_UBH)
+	if (!USPI_UBH(uspi))
 		return;
-	for ( i = 0; i < USPI_UBH->count; i++ ) {
-		brelse (USPI_UBH->bh[i]);
-		USPI_UBH->bh[i] = NULL;
+	for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) {
+		brelse (USPI_UBH(uspi)->bh[i]);
+		USPI_UBH(uspi)->bh[i] = NULL;
 	}
 }
 
@@ -121,13 +112,12 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
 	}
 }
 
-void ubh_ll_rw_block (int rw, unsigned nr, struct ufs_buffer_head * ubh[])
+void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
 {
-	unsigned i;
 	if (!ubh)
 		return;
-	for ( i = 0; i < nr; i++ )
-		ll_rw_block (rw, ubh[i]->count, ubh[i]->bh);
+
+	ll_rw_block(rw, ubh->count, ubh->bh);
 }
 
 void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
@@ -139,18 +129,6 @@ void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
 		wait_on_buffer (ubh->bh[i]);
 }
 
-unsigned ubh_max_bcount (struct ufs_buffer_head * ubh)
-{
-	unsigned i;
-	unsigned max = 0;
-	if (!ubh)
-		return 0;
-	for ( i = 0; i < ubh->count; i++ ) 
-		if ( atomic_read(&ubh->bh[i]->b_count) > max )
-			max = atomic_read(&ubh->bh[i]->b_count);
-	return max;
-}
-
 void ubh_bforget (struct ufs_buffer_head * ubh)
 {
 	unsigned i;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 48d6d9bcc157..406981fff5e7 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -17,10 +17,16 @@
 #define in_range(b,first,len)	((b)>=(first)&&(b)<(first)+(len))
 
 /*
- * macros used for retyping
+ * functions used for retyping
  */
-#define UCPI_UBH ((struct ufs_buffer_head *)ucpi)
-#define USPI_UBH ((struct ufs_buffer_head *)uspi)
+static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi)
+{
+	return &cpi->c_ubh;
+}
+static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
+{
+	return &spi->s_ubh;
+}
 
 
 
@@ -33,12 +39,12 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_sun.fs_state);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
 	case UFS_ST_SUNx86:
 		return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state);
 	case UFS_ST_44BSD:
 	default:
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_state);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state);
 	}
 }
 
@@ -48,13 +54,13 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		usb3->fs_u2.fs_sun.fs_state = cpu_to_fs32(sb, value);
+		usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
 		break;
 	case UFS_ST_SUNx86:
 		usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value);
 		break;
 	case UFS_ST_44BSD:
-		usb3->fs_u2.fs_44.fs_state = cpu_to_fs32(sb, value);
+		usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value);
 		break;
 	}
 }
@@ -64,7 +70,7 @@ ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1,
 		  struct ufs_super_block_third *usb3)
 {
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_sunx86.fs_npsect);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect);
 	else
 		return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect);
 }
@@ -76,16 +82,16 @@ ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3)
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1];
 		break;
 	case UFS_ST_SUNx86:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1];
 		break;
 	case UFS_ST_44BSD:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1];
 		break;
 	}
 
@@ -99,16 +105,16 @@ ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3)
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1];
 		break;
 	case UFS_ST_SUNx86:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1];
 		break;
 	case UFS_ST_44BSD:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1];
 		break;
 	}
 
@@ -236,9 +242,8 @@ extern void ubh_brelse (struct ufs_buffer_head *);
 extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
 extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
 extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
-extern void ubh_ll_rw_block (int, unsigned, struct ufs_buffer_head **);
+extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
 extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
-extern unsigned ubh_max_bcount (struct ufs_buffer_head *);
 extern void ubh_bforget (struct ufs_buffer_head *);
 extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
 #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
@@ -297,40 +302,26 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
 #define ubh_blkmap(ubh,begin,bit) \
 	((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
 
-
-/*
- * Macros for access to superblock array structures
- */
-#define ubh_postbl(ubh,cylno,i) \
-	((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
-	? (*(__s16*)(ubh_get_addr(ubh, \
-	(unsigned)(&((struct ufs_super_block *)0)->fs_opostbl) \
-	+ (((cylno) * 16 + (i)) << 1) ) )) \
-	: (*(__s16*)(ubh_get_addr(ubh, \
-	uspi->s_postbloff + (((cylno) * uspi->s_nrpos + (i)) << 1) ))))
-
-#define ubh_rotbl(ubh,i) \
-	((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
-	? (*(__u8*)(ubh_get_addr(ubh, \
-	(unsigned)(&((struct ufs_super_block *)0)->fs_space) + (i)))) \
-	: (*(__u8*)(ubh_get_addr(ubh, uspi->s_rotbloff + (i)))))
-
 /*
  * Determine the number of available frags given a
  * percentage to hold in reserve.
  */
-#define ufs_freespace(usb, percentreserved) \
-	(ufs_blkstofrags(fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nbfree)) + \
-	fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nffree) - (uspi->s_dsize * (percentreserved) / 100))
+static inline u64
+ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
+{
+	return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+		uspi->cs_total.cs_nffree -
+		(uspi->s_dsize * (percentreserved) / 100);
+}
 
 /*
  * Macros to access cylinder group array structures
  */
 #define ubh_cg_blktot(ucpi,cylno) \
-	(*((__fs32*)ubh_get_addr(UCPI_UBH, (ucpi)->c_btotoff + ((cylno) << 2))))
+	(*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2))))
 
 #define ubh_cg_blks(ucpi,cylno,rpos) \
-	(*((__fs16*)ubh_get_addr(UCPI_UBH, \
+	(*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \
 	(ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 ))))
 
 /*
@@ -508,29 +499,3 @@ static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap,
 	if (fragsize > 0 && fragsize < uspi->s_fpb)
 		fs32_add(sb, &fraglist[fragsize], cnt);
 }
-
-#define ubh_scanc(ubh,begin,size,table,mask) _ubh_scanc_(uspi,ubh,begin,size,table,mask)
-static inline unsigned _ubh_scanc_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, 
-	unsigned begin, unsigned size, unsigned char * table, unsigned char mask)
-{
-	unsigned rest, offset;
-	unsigned char * cp;
-	
-
-	offset = begin & ~uspi->s_fmask;
-	begin >>= uspi->s_fshift;
-	for (;;) {
-		if ((offset + size) < uspi->s_fsize)
-			rest = size;
-		else
-			rest = uspi->s_fsize - offset;
-		size -= rest;
-		cp = ubh->bh[begin]->b_data + offset;
-		while ((table[*cp++] & mask) == 0 && --rest);
-		if (rest || !size)
-			break;
-		begin++;
-		offset = 0;
-	}
-	return (size + rest);
-}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a56cec3be5f0..9a8f48bae956 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -1023,11 +1023,12 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *vfat_get_sb(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *data)
+static int vfat_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *data, struct vfsmount *mnt)	/* mnt: passed through to get_sb_bdev() */
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+			   mnt);
 }
 
 static struct file_system_type vfat_fs_type = {
diff --git a/fs/xattr.c b/fs/xattr.c
index e416190f5e9c..c32f15b5f60f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -242,7 +242,7 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
 	if (!f)
 		return error;
 	dentry = f->f_dentry;
-	audit_inode(NULL, dentry->d_inode, 0);
+	audit_inode(NULL, dentry->d_inode);
 	error = setxattr(dentry, name, value, size, flags);
 	fput(f);
 	return error;
@@ -469,7 +469,7 @@ sys_fremovexattr(int fd, char __user *name)
 	if (!f)
 		return error;
 	dentry = f->f_dentry;
-	audit_inode(NULL, dentry->d_inode, 0);
+	audit_inode(NULL, dentry->d_inode);
 	error = removexattr(dentry, name);
 	fput(f);
 	return error;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index bac27d66151d..26b364c9d62c 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,5 @@
 config XFS_FS
 	tristate "XFS filesystem support"
-	select EXPORTFS if NFSD!=n
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
@@ -18,11 +17,6 @@ config XFS_FS
 	  system of your root partition is compiled as a module, you'll need
 	  to use an initial ramdisk (initrd) to boot.
 
-config XFS_EXPORT
-	bool
-	depends on XFS_FS && EXPORTFS
-	default y
-
 config XFS_QUOTA
 	bool "XFS Quota support"
 	depends on XFS_FS
@@ -65,18 +59,19 @@ config XFS_POSIX_ACL
 	  If you don't know what Access Control Lists are, say N.
 
 config XFS_RT
-	bool "XFS Realtime support (EXPERIMENTAL)"
-	depends on XFS_FS && EXPERIMENTAL
+	bool "XFS Realtime subvolume support"
+	depends on XFS_FS
 	help
 	  If you say Y here you will be able to mount and use XFS filesystems
-	  which contain a realtime subvolume. The realtime subvolume is a
-	  separate area of disk space where only file data is stored. The
-	  realtime subvolume is designed to provide very deterministic
-	  data rates suitable for media streaming applications.
-
-	  See the xfs man page in section 5 for a bit more information.
+	  which contain a realtime subvolume.  The realtime subvolume is a
+	  separate area of disk space where only file data is stored.  It was
+	  originally designed to provide deterministic data rates suitable
+	  for media streaming applications, but is also useful as a generic
+	  mechanism for ensuring data and metadata/log I/Os are completely
+	  separated.  Regular file I/Os are isolated to a separate device
+	  from all other requests, and this can be done quite transparently
+	  to applications via the inherit-realtime directory inode flag.
 
-	  This feature is unsupported at this time, is not yet fully
-	  functional, and may cause serious problems.
+	  See the xfs man page in section 5 for additional information.
 
 	  If unsure, say N.
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index 5d73eaa1971f..9e7f85986d0d 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -59,7 +59,6 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= $(XFS_LINUX)/xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= $(XFS_LINUX)/xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= $(XFS_LINUX)/xfs_ioctl32.o
-xfs-$(CONFIG_XFS_EXPORT)	+= $(XFS_LINUX)/xfs_export.o
 
 
 xfs-y				+= xfs_alloc.o \
@@ -73,14 +72,12 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_btree.o \
 				   xfs_buf_item.o \
 				   xfs_da_btree.o \
-				   xfs_dir.o \
 				   xfs_dir2.o \
 				   xfs_dir2_block.o \
 				   xfs_dir2_data.o \
 				   xfs_dir2_leaf.o \
 				   xfs_dir2_node.o \
 				   xfs_dir2_sf.o \
-				   xfs_dir_leaf.o \
 				   xfs_error.o \
 				   xfs_extfree_item.o \
 				   xfs_fsops.o \
@@ -117,6 +114,7 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
 				   kmem.o \
 				   xfs_aops.o \
 				   xfs_buf.o \
+				   xfs_export.o \
 				   xfs_file.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 2cfd33d4d8aa..939bd84bc7ee 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -23,42 +23,6 @@
 #include <linux/mm.h>
 
 /*
- * Process flags handling
- */
-
-#define PFLAGS_TEST_NOIO()              (current->flags & PF_NOIO)
-#define PFLAGS_TEST_FSTRANS()           (current->flags & PF_FSTRANS)
-
-#define PFLAGS_SET_NOIO() do {		\
-	current->flags |= PF_NOIO;	\
-} while (0)
-
-#define PFLAGS_CLEAR_NOIO() do {	\
-	current->flags &= ~PF_NOIO;	\
-} while (0)
-
-/* these could be nested, so we save state */
-#define PFLAGS_SET_FSTRANS(STATEP) do {	\
-	*(STATEP) = current->flags;	\
-	current->flags |= PF_FSTRANS;	\
-} while (0)
-
-#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \
-	*(STATEP) = current->flags;	\
-	current->flags &= ~PF_FSTRANS;	\
-} while (0)
-
-/* Restore the PF_FSTRANS state to what was saved in STATEP */
-#define PFLAGS_RESTORE_FSTRANS(STATEP) do {     		\
-	current->flags = ((current->flags & ~PF_FSTRANS) |	\
-			  (*(STATEP) & PF_FSTRANS));		\
-} while (0)
-
-#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
-	*(NSTATEP) = *(OSTATEP);	\
-} while (0)
-
-/*
  * General memory allocation interfaces
  */
 
@@ -83,7 +47,7 @@ kmem_flags_convert(unsigned int __nocast flags)
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
 	} else {
 		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS))
+		if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
 			lflags &= ~__GFP_FS;
 	}
 	return lflags;
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
index 1b262b790d9c..32e1ce0f04c9 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/linux-2.6/mrlock.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -28,7 +28,7 @@ typedef struct {
 } mrlock_t;
 
 #define mrinit(mrp, name)	\
-	( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
+	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
 #define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
 #define mrfree(mrp)		do { } while (0)
 #define mraccess(mrp)		mraccessf(mrp, 0)
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 194a84490bd1..b25090094cca 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -34,20 +34,21 @@ typedef struct semaphore sema_t;
 #define initnsema(sp, val, name)	sema_init(sp, val)
 #define psema(sp, b)			down(sp)
 #define vsema(sp)			up(sp)
-#define valusema(sp)			(atomic_read(&(sp)->count))
-#define freesema(sema)
+#define freesema(sema)			do { } while (0)
+
+static inline int issemalocked(sema_t *sp)	/* 1 if held, 0 if free */
+{
+	return down_trylock(sp) ? 1 : (up(sp), 0); /* give back our probe grab */
+}
 
 /*
  * Map cpsema (try to get the sema) to down_trylock. We need to switch
  * the return values since cpsema returns 1 (acquired) 0 (failed) and
  * down_trylock returns the reverse 0 (acquired) 1 (failed).
  */
-
-#define cpsema(sp)			(down_trylock(sp) ? 0 : 1)
-
-/*
- * Didn't do cvsema(sp). Not sure how to map this to up/down/...
- * It does a vsema if the values is < 0 other wise nothing.
- */
+static inline int cpsema(sema_t *sp)	/* 1 = acquired, 0 = failed */
+{
+	return !down_trylock(sp);	/* flip down_trylock's 0-on-success */
+}
 
 #endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 4d191ef39b67..3e807b828e22 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,7 +21,6 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_trans.h"
 #include "xfs_dmapi.h"
@@ -29,7 +28,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -76,7 +74,7 @@ xfs_page_trace(
 	int		mask)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	loff_t		isize = i_size_read(inode);
 	loff_t		offset = page_offset(page);
 	int		delalloc = -1, unmapped = -1, unwritten = -1;
@@ -136,9 +134,10 @@ xfs_destroy_ioend(
 
 	for (bh = ioend->io_buffer_head; bh; bh = next) {
 		next = bh->b_private;
-		bh->b_end_io(bh, ioend->io_uptodate);
+		bh->b_end_io(bh, !ioend->io_error);
 	}
-
+	if (unlikely(ioend->io_error))
+		vn_ioerror(ioend->io_vnode, ioend->io_error, __FILE__,__LINE__);
 	vn_iowake(ioend->io_vnode);
 	mempool_free(ioend, xfs_ioend_pool);
 }
@@ -180,13 +179,12 @@ xfs_end_bio_unwritten(
 	void			*data)
 {
 	xfs_ioend_t		*ioend = data;
-	vnode_t			*vp = ioend->io_vnode;
+	bhv_vnode_t		*vp = ioend->io_vnode;
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
-	int			error;
 
-	if (ioend->io_uptodate)
-		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
+	if (likely(!ioend->io_error))
+		bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -211,7 +209,7 @@ xfs_alloc_ioend(
 	 * all the I/O from calling the completion routine too early.
 	 */
 	atomic_set(&ioend->io_remaining, 1);
-	ioend->io_uptodate = 1; /* cleared if any I/O fails */
+	ioend->io_error = 0;
 	ioend->io_list = NULL;
 	ioend->io_type = type;
 	ioend->io_vnode = vn_from_inode(inode);
@@ -239,10 +237,10 @@ xfs_map_blocks(
 	xfs_iomap_t		*mapp,
 	int			flags)
 {
-	vnode_t			*vp = vn_from_inode(inode);
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 	int			error, nmaps = 1;
 
-	VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error);
+	error = bhv_vop_bmap(vp, offset, count, flags, mapp, &nmaps);
 	if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
 		VMODIFY(vp);
 	return -error;
@@ -271,16 +269,14 @@ xfs_end_bio(
 	if (bio->bi_size)
 		return 1;
 
-	ASSERT(ioend);
 	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
+	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 
 	/* Toss bio and pass work off to an xfsdatad thread */
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-		ioend->io_uptodate = 0;
 	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
-
 	bio_put(bio);
+
 	xfs_finish_ioend(ioend);
 	return 0;
 }
@@ -1127,7 +1123,7 @@ xfs_vm_writepage(
 	 * then mark the page dirty again and leave the page
 	 * as is.
 	 */
-	if (PFLAGS_TEST_FSTRANS() && need_trans)
+	if (current_test_flags(PF_FSTRANS) && need_trans)
 		goto out_fail;
 
 	/*
@@ -1158,6 +1154,18 @@ out_unlock:
 	return error;
 }
 
+STATIC int
+xfs_vm_writepages(
+	struct address_space	*mapping,	/* its ->host inode yields the vnode */
+	struct writeback_control *wbc)		/* handed on to generic_writepages() */
+{
+	struct bhv_vnode	*vp = vn_from_inode(mapping->host);
+
+	if (VN_TRUNC(vp))
+		VUNTRUNCATE(vp);	/* presumably resets the truncate flag - confirm */
+	return generic_writepages(mapping, wbc);
+}
+
 /*
  * Called to move a page into cleanable state - and from there
  * to be released. Possibly the page is already clean. We always
@@ -1204,7 +1212,7 @@ xfs_vm_releasepage(
 	/* If we are already inside a transaction or the thread cannot
 	 * do I/O, we cannot release this page.
 	 */
-	if (PFLAGS_TEST_FSTRANS())
+	if (current_test_flags(PF_FSTRANS))
 		return 0;
 
 	/*
@@ -1231,7 +1239,7 @@ __xfs_get_blocks(
 	int			direct,
 	bmapi_flags_t		flags)
 {
-	vnode_t			*vp = vn_from_inode(inode);
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 	xfs_iomap_t		iomap;
 	xfs_off_t		offset;
 	ssize_t			size;
@@ -1241,8 +1249,8 @@ __xfs_get_blocks(
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
 	size = bh_result->b_size;
-	VOP_BMAP(vp, offset, size,
-		create ? flags : BMAPI_READ, &iomap, &niomap, error);
+	error = bhv_vop_bmap(vp, offset, size,
+			     create ? flags : BMAPI_READ, &iomap, &niomap);
 	if (error)
 		return -error;
 	if (niomap == 0)
@@ -1370,13 +1378,13 @@ xfs_vm_direct_IO(
 {
 	struct file	*file = iocb->ki_filp;
 	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	xfs_iomap_t	iomap;
 	int		maps = 1;
 	int		error;
 	ssize_t		ret;
 
-	VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
+	error = bhv_vop_bmap(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps);
 	if (error)
 		return -error;
 
@@ -1409,14 +1417,12 @@ xfs_vm_bmap(
 	sector_t		block)
 {
 	struct inode		*inode = (struct inode *)mapping->host;
-	vnode_t			*vp = vn_from_inode(inode);
-	int			error;
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
-
-	VOP_RWLOCK(vp, VRWLOCK_READ);
-	VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
-	VOP_RWUNLOCK(vp, VRWLOCK_READ);
+	bhv_vop_rwlock(vp, VRWLOCK_READ);
+	bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF);
+	bhv_vop_rwunlock(vp, VRWLOCK_READ);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
 
@@ -1452,6 +1458,7 @@ struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
+	.writepages		= xfs_vm_writepages,
 	.sync_page		= block_sync_page,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 60716543c68b..706d8c781b8a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005 Silicon Graphics, Inc.
+ * Copyright (c) 2005-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -30,9 +30,9 @@ typedef void (*xfs_ioend_func_t)(void *);
 typedef struct xfs_ioend {
 	struct xfs_ioend	*io_list;	/* next ioend in chain */
 	unsigned int		io_type;	/* delalloc / unwritten */
-	unsigned int		io_uptodate;	/* I/O status register */
+	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
-	struct vnode		*io_vnode;	/* file being written to */
+	struct bhv_vnode	*io_vnode;	/* file being written to */
 	struct buffer_head	*io_buffer_head;/* buffer linked list head */
 	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
 	size_t			io_size;	/* size of the extent */
@@ -43,4 +43,4 @@ typedef struct xfs_ioend {
 extern struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 
-#endif /* __XFS_IOPS_H__ */
+#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index b768ea910bbe..5fb75d9151f2 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -21,7 +21,6 @@
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_mount.h"
 #include "xfs_export.h"
 
@@ -97,7 +96,7 @@ xfs_fs_encode_fh(
 	int			len;
 	int			is64 = 0;
 #if XFS_BIG_INUMS
-	vfs_t			*vfs = vfs_from_sb(inode->i_sb);
+	bhv_vfs_t		*vfs = vfs_from_sb(inode->i_sb);
 
 	if (!(vfs->vfs_flag & VFS_32BITINODES)) {
 		/* filesystem may contain 64bit inode numbers */
@@ -136,13 +135,13 @@ xfs_fs_get_dentry(
 	struct super_block	*sb,
 	void			*data)
 {
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	struct inode		*inode;
 	struct dentry		*result;
-	vfs_t			*vfsp = vfs_from_sb(sb);
+	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
 	int			error;
 
-	VFS_VGET(vfsp, &vp, (fid_t *)data, error);
+	error = bhv_vfs_vget(vfsp, &vp, (fid_t *)data);
 	if (error || vp == NULL)
 		return ERR_PTR(-ESTALE) ;
 
@@ -160,12 +159,12 @@ xfs_fs_get_parent(
 	struct dentry		*child)
 {
 	int			error;
-	vnode_t			*vp, *cvp;
+	bhv_vnode_t		*vp, *cvp;
 	struct dentry		*parent;
 
 	cvp = NULL;
 	vp = vn_from_inode(child->d_inode);
-	VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
+	error = bhv_vop_lookup(vp, &dotdot, &cvp, 0, NULL, NULL);
 	if (unlikely(error))
 		return ERR_PTR(-error);
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index c847416f6d10..3d4f6dff2113 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -21,7 +21,6 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_trans.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -58,15 +56,12 @@ __xfs_file_read(
 {
 	struct iovec		iov = {buf, count};
 	struct file		*file = iocb->ki_filp;
-	vnode_t			*vp = vn_from_inode(file->f_dentry->d_inode);
-	ssize_t			rval;
+	bhv_vnode_t		*vp = vn_from_inode(file->f_dentry->d_inode);
 
 	BUG_ON(iocb->ki_pos != pos);
-
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-	VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
-	return rval;
+	return bhv_vop_read(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
 }
 
 STATIC ssize_t
@@ -100,15 +95,12 @@ __xfs_file_write(
 	struct iovec	iov = {(void __user *)buf, count};
 	struct file	*file = iocb->ki_filp;
 	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = vn_from_inode(inode);
-	ssize_t		rval;
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-
-	VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
-	return rval;
+	return bhv_vop_write(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
 }
 
 STATIC ssize_t
@@ -140,7 +132,7 @@ __xfs_file_readv(
 	loff_t			*ppos)
 {
 	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	struct kiocb	kiocb;
 	ssize_t		rval;
 
@@ -149,7 +141,8 @@ __xfs_file_readv(
 
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-	VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
+	rval = bhv_vop_read(vp, &kiocb, iov, nr_segs,
+				&kiocb.ki_pos, ioflags, NULL);
 
 	*ppos = kiocb.ki_pos;
 	return rval;
@@ -184,7 +177,7 @@ __xfs_file_writev(
 	loff_t			*ppos)
 {
 	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	struct kiocb	kiocb;
 	ssize_t		rval;
 
@@ -193,7 +186,8 @@ __xfs_file_writev(
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
 
-	VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
+	rval = bhv_vop_write(vp, &kiocb, iov, nr_segs,
+				 &kiocb.ki_pos, ioflags, NULL);
 
 	*ppos = kiocb.ki_pos;
 	return rval;
@@ -227,11 +221,8 @@ xfs_file_sendfile(
 	read_actor_t		actor,
 	void			*target)
 {
-	vnode_t			*vp = vn_from_inode(filp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SENDFILE(vp, filp, pos, 0, count, actor, target, NULL, rval);
-	return rval;
+	return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode),
+				filp, pos, 0, count, actor, target, NULL);
 }
 
 STATIC ssize_t
@@ -242,11 +233,8 @@ xfs_file_sendfile_invis(
 	read_actor_t		actor,
 	void			*target)
 {
-	vnode_t			*vp = vn_from_inode(filp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SENDFILE(vp, filp, pos, IO_INVIS, count, actor, target, NULL, rval);
-	return rval;
+	return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode),
+				filp, pos, IO_INVIS, count, actor, target, NULL);
 }
 
 STATIC ssize_t
@@ -257,11 +245,8 @@ xfs_file_splice_read(
 	size_t			len,
 	unsigned int		flags)
 {
-	vnode_t			*vp = vn_from_inode(infilp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval);
-	return rval;
+	return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode),
+				   infilp, ppos, pipe, len, flags, 0, NULL);
 }
 
 STATIC ssize_t
@@ -272,11 +257,9 @@ xfs_file_splice_read_invis(
 	size_t			len,
 	unsigned int		flags)
 {
-	vnode_t			*vp = vn_from_inode(infilp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval);
-	return rval;
+	return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode),
+				   infilp, ppos, pipe, len, flags, IO_INVIS,
+				   NULL);
 }
 
 STATIC ssize_t
@@ -287,11 +270,8 @@ xfs_file_splice_write(
 	size_t			len,
 	unsigned int		flags)
 {
-	vnode_t			*vp = vn_from_inode(outfilp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval);
-	return rval;
+	return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode),
+				    pipe, outfilp, ppos, len, flags, 0, NULL);
 }
 
 STATIC ssize_t
@@ -302,11 +282,9 @@ xfs_file_splice_write_invis(
 	size_t			len,
 	unsigned int		flags)
 {
-	vnode_t			*vp = vn_from_inode(outfilp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval);
-	return rval;
+	return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode),
+				    pipe, outfilp, ppos, len, flags, IO_INVIS,
+				    NULL);
 }
 
 STATIC int
@@ -314,13 +292,18 @@ xfs_file_open(
 	struct inode	*inode,
 	struct file	*filp)
 {
-	vnode_t		*vp = vn_from_inode(inode);
-	int		error;
-
 	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
 		return -EFBIG;
-	VOP_OPEN(vp, NULL, error);
-	return -error;
+	return -bhv_vop_open(vn_from_inode(inode), NULL);
+}
+
+STATIC int
+xfs_file_close(
+	struct file	*filp,		/* L_TRUE is passed only if file_count(filp) <= 1 */
+	fl_owner_t	id)		/* not used in this function */
+{
+	return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0,
+				file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL);
+}
 
 STATIC int
@@ -328,12 +311,11 @@ xfs_file_release(
 	struct inode	*inode,
 	struct file	*filp)
 {
-	vnode_t		*vp = vn_from_inode(inode);
-	int		error = 0;
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	if (vp)
-		VOP_RELEASE(vp, error);
-	return -error;
+		return -bhv_vop_release(vp);
+	return 0;
 }
 
 STATIC int
@@ -342,15 +324,14 @@ xfs_file_fsync(
 	struct dentry	*dentry,
 	int		datasync)
 {
-	struct inode	*inode = dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
-	int		error;
+	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
 	int		flags = FSYNC_WAIT;
 
 	if (datasync)
 		flags |= FSYNC_DATA;
-	VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
-	return -error;
+	if (VN_TRUNC(vp))
+		VUNTRUNCATE(vp);
+	return -bhv_vop_fsync(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1);
 }
 
 #ifdef CONFIG_XFS_DMAPI
@@ -361,16 +342,11 @@ xfs_vm_nopage(
 	int			*type)
 {
 	struct inode	*inode = area->vm_file->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
-	xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
-	int		error;
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
-
-	error = XFS_SEND_MMAP(mp, area, 0);
-	if (error)
+	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), area, 0))
 		return NULL;
-
 	return filemap_nopage(area, address, type);
 }
 #endif /* CONFIG_XFS_DMAPI */
@@ -382,7 +358,7 @@ xfs_file_readdir(
 	filldir_t	filldir)
 {
 	int		error = 0;
-	vnode_t		*vp = vn_from_inode(filp->f_dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(filp->f_dentry->d_inode);
 	uio_t		uio;
 	iovec_t		iov;
 	int		eof = 0;
@@ -417,7 +393,7 @@ xfs_file_readdir(
 
 		start_offset = uio.uio_offset;
 
-		VOP_READDIR(vp, &uio, NULL, &eof, error);
+		error = bhv_vop_readdir(vp, &uio, NULL, &eof);
 		if ((uio.uio_offset == start_offset) || error) {
 			size = 0;
 			break;
@@ -456,38 +432,28 @@ xfs_file_mmap(
 	struct file	*filp,
 	struct vm_area_struct *vma)
 {
-	struct inode	*ip = filp->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(ip);
-	vattr_t		vattr;
-	int		error;
-
 	vma->vm_ops = &xfs_file_vm_ops;
 
 #ifdef CONFIG_XFS_DMAPI
-	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
+	if (vn_from_inode(filp->f_dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)
 		vma->vm_ops = &xfs_dmapi_file_vm_ops;
-	}
 #endif /* CONFIG_XFS_DMAPI */
 
-	vattr.va_mask = XFS_AT_UPDATIME;
-	VOP_SETATTR(vp, &vattr, XFS_AT_UPDATIME, NULL, error);
-	if (likely(!error))
-		__vn_revalidate(vp, &vattr);	/* update flags */
+	file_accessed(filp);
 	return 0;
 }
 
-
 STATIC long
 xfs_file_ioctl(
 	struct file	*filp,
 	unsigned int	cmd,
-	unsigned long	arg)
+	unsigned long	p)
 {
 	int		error;
 	struct inode	*inode = filp->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
-	VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error);
+	error = bhv_vop_ioctl(vp, inode, filp, 0, cmd, (void __user *)p);
 	VMODIFY(vp);
 
 	/* NOTE:  some of the ioctl's return positive #'s as a
@@ -503,13 +469,13 @@ STATIC long
 xfs_file_ioctl_invis(
 	struct file	*filp,
 	unsigned int	cmd,
-	unsigned long	arg)
+	unsigned long	p)
 {
-	struct inode	*inode = filp->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
 	int		error;
+	struct inode	*inode = filp->f_dentry->d_inode;
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
-	VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error);
+	error = bhv_vop_ioctl(vp, inode, filp, IO_INVIS, cmd, (void __user *)p);
 	VMODIFY(vp);
 
 	/* NOTE:  some of the ioctl's return positive #'s as a
@@ -528,7 +494,7 @@ xfs_vm_mprotect(
 	struct vm_area_struct *vma,
 	unsigned int	newflags)
 {
-	vnode_t		*vp = vn_from_inode(vma->vm_file->f_dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(vma->vm_file->f_dentry->d_inode);
 	int		error = 0;
 
 	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
@@ -554,24 +520,19 @@ STATIC int
 xfs_file_open_exec(
 	struct inode	*inode)
 {
-	vnode_t		*vp = vn_from_inode(inode);
-	xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
-	int		error = 0;
-	xfs_inode_t	*ip;
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
-	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
-		ip = xfs_vtoi(vp);
-		if (!ip) {
-			error = -EINVAL;
-			goto open_exec_out;
-		}
-		if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) {
-			error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
+	if (unlikely(vp->v_vfsp->vfs_flag & VFS_DMI)) {
+		xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
+		xfs_inode_t	*ip = xfs_vtoi(vp);
+
+		if (!ip)
+			return -EINVAL;
+		if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ))
+			return -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
 					       0, 0, 0, NULL);
-		}
 	}
-open_exec_out:
-	return error;
+	return 0;
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
 
@@ -592,6 +553,7 @@ const struct file_operations xfs_file_operations = {
 #endif
 	.mmap		= xfs_file_mmap,
 	.open		= xfs_file_open,
+	.flush		= xfs_file_close,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
 #ifdef HAVE_FOP_OPEN_EXEC
@@ -616,6 +578,7 @@ const struct file_operations xfs_invis_file_operations = {
 #endif
 	.mmap		= xfs_file_mmap,
 	.open		= xfs_file_open,
+	.flush		= xfs_file_close,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
 };
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 575f2a790f31..dc0562828e76 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -15,40 +15,12 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
-
 #include "xfs.h"
 
-/*
- * Stub for no-op vnode operations that return error status.
- */
-int
-fs_noerr(void)
-{
-	return 0;
-}
+int  fs_noerr(void) { return 0; }	/* stub for no-op vnode operations that return status */
+int  fs_nosys(void) { return ENOSYS; }	/* operation unsupported; positive ENOSYS */
+void fs_noval(void) { return; }		/* stub that does nothing and returns no value */
 
-/*
- * Operation unsupported under this file system.
- */
-int
-fs_nosys(void)
-{
-	return ENOSYS;
-}
-
-/*
- * Stub for inactive, strategy, and read/write lock/unlock.  Does nothing.
- */
-/* ARGSUSED */
-void
-fs_noval(void)
-{
-}
-
-/*
- * vnode pcache layer for vnode_tosspages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 void
 fs_tosspages(
 	bhv_desc_t	*bdp,
@@ -56,18 +28,13 @@ fs_tosspages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
 
 	if (VN_CACHED(vp))
 		truncate_inode_pages(ip->i_mapping, first);
 }
 
-
-/*
- * vnode pcache layer for vnode_flushinval_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 void
 fs_flushinval_pages(
 	bhv_desc_t	*bdp,
@@ -75,20 +42,17 @@ fs_flushinval_pages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
 
 	if (VN_CACHED(vp)) {
+		if (VN_TRUNC(vp))
+			VUNTRUNCATE(vp);
 		filemap_write_and_wait(ip->i_mapping);
-
 		truncate_inode_pages(ip->i_mapping, first);
 	}
 }
 
-/*
- * vnode pcache layer for vnode_flush_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 int
 fs_flush_pages(
 	bhv_desc_t	*bdp,
@@ -97,15 +61,16 @@ fs_flush_pages(
 	uint64_t	flags,
 	int		fiopt)
 {
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
 
-	if (VN_CACHED(vp)) {
+	if (VN_DIRTY(vp)) {
+		if (VN_TRUNC(vp))
+			VUNTRUNCATE(vp);
 		filemap_fdatawrite(ip->i_mapping);
 		if (flags & XFS_B_ASYNC)
 			return 0;
 		filemap_fdatawait(ip->i_mapping);
 	}
-
 	return 0;
 }
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 6e8085f34635..6c162c3dde7e 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -45,6 +45,7 @@ xfs_param_t xfs_params = {
 	.xfs_buf_age	= {	1*100,		15*100,		7200*100},
 	.inherit_nosym	= {	0,		0,		1	},
 	.rotorstep	= {	1,		1,		255	},
+	.inherit_nodfrg	= {	0,		1,		1	},
 };
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 84478491609b..6e52a5dd38d8 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -31,7 +30,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
@@ -78,7 +76,7 @@ xfs_find_handle(
 	xfs_handle_t		handle;
 	xfs_fsop_handlereq_t	hreq;
 	struct inode		*inode;
-	struct vnode		*vp;
+	bhv_vnode_t		*vp;
 
 	if (copy_from_user(&hreq, arg, sizeof(hreq)))
 		return -XFS_ERROR(EFAULT);
@@ -192,7 +190,7 @@ xfs_vget_fsop_handlereq(
 	xfs_mount_t		*mp,
 	struct inode		*parinode,	/* parent inode pointer    */
 	xfs_fsop_handlereq_t	*hreq,
-	vnode_t			**vp,
+	bhv_vnode_t		**vp,
 	struct inode		**inode)
 {
 	void			__user *hanp;
@@ -202,7 +200,7 @@ xfs_vget_fsop_handlereq(
 	xfs_handle_t		handle;
 	xfs_inode_t		*ip;
 	struct inode		*inodep;
-	vnode_t			*vpp;
+	bhv_vnode_t		*vpp;
 	xfs_ino_t		ino;
 	__u32			igen;
 	int			error;
@@ -277,7 +275,7 @@ xfs_open_by_handle(
 	struct file		*filp;
 	struct inode		*inode;
 	struct dentry		*dentry;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	xfs_fsop_handlereq_t	hreq;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -362,7 +360,7 @@ xfs_readlink_by_handle(
 	struct uio		auio;
 	struct inode		*inode;
 	xfs_fsop_handlereq_t	hreq;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	__u32			olen;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -393,9 +391,11 @@ xfs_readlink_by_handle(
 	auio.uio_segflg	= UIO_USERSPACE;
 	auio.uio_resid	= olen;
 
-	VOP_READLINK(vp, &auio, IO_INVIS, NULL, error);
-
+	error = bhv_vop_readlink(vp, &auio, IO_INVIS, NULL);
 	VN_RELE(vp);
+	if (error)
+		return -error;
+
 	return (olen - auio.uio_resid);
 }
 
@@ -411,7 +411,7 @@ xfs_fssetdm_by_handle(
 	xfs_fsop_setdm_handlereq_t dmhreq;
 	struct inode		*inode;
 	bhv_desc_t		*bdp;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 
 	if (!capable(CAP_MKNOD))
 		return -XFS_ERROR(EPERM);
@@ -452,7 +452,7 @@ xfs_attrlist_by_handle(
 	attrlist_cursor_kern_t	*cursor;
 	xfs_fsop_attrlist_handlereq_t al_hreq;
 	struct inode		*inode;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -472,8 +472,8 @@ xfs_attrlist_by_handle(
 		goto out_vn_rele;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags,
-			cursor, NULL, error);
+	error = bhv_vop_attr_list(vp, kbuf, al_hreq.buflen, al_hreq.flags,
+					cursor, NULL);
 	if (error)
 		goto out_kfree;
 
@@ -490,7 +490,7 @@ xfs_attrlist_by_handle(
 
 STATIC int
 xfs_attrmulti_attr_get(
-	struct vnode		*vp,
+	bhv_vnode_t		*vp,
 	char			*name,
 	char			__user *ubuf,
 	__uint32_t		*len,
@@ -505,7 +505,7 @@ xfs_attrmulti_attr_get(
 	if (!kbuf)
 		return ENOMEM;
 
-	VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error);
+	error = bhv_vop_attr_get(vp, name, kbuf, len, flags, NULL);
 	if (error)
 		goto out_kfree;
 
@@ -519,7 +519,7 @@ xfs_attrmulti_attr_get(
 
 STATIC int
 xfs_attrmulti_attr_set(
-	struct vnode		*vp,
+	bhv_vnode_t		*vp,
 	char			*name,
 	const char		__user *ubuf,
 	__uint32_t		len,
@@ -542,7 +542,7 @@ xfs_attrmulti_attr_set(
 	if (copy_from_user(kbuf, ubuf, len))
 		goto out_kfree;
 			
-	VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error);
+	error = bhv_vop_attr_set(vp, name, kbuf, len, flags, NULL);
 
  out_kfree:
 	kfree(kbuf);
@@ -551,20 +551,15 @@ xfs_attrmulti_attr_set(
 
 STATIC int
 xfs_attrmulti_attr_remove(
-	struct vnode		*vp,
+	bhv_vnode_t		*vp,
 	char			*name,
 	__uint32_t		flags)
 {
-	int			error;
-
-
 	if (IS_RDONLY(&vp->v_inode))
 		return -EROFS;
 	if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
 		return EPERM;
-
-	VOP_ATTR_REMOVE(vp, name, flags, NULL, error);
-	return error;
+	return bhv_vop_attr_remove(vp, name, flags, NULL);
 }
 
 STATIC int
@@ -578,7 +573,7 @@ xfs_attrmulti_by_handle(
 	xfs_attr_multiop_t	*ops;
 	xfs_fsop_attrmulti_handlereq_t am_hreq;
 	struct inode		*inode;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	unsigned int		i, size;
 	char			*attr_name;
 
@@ -658,7 +653,7 @@ xfs_attrmulti_by_handle(
 STATIC int
 xfs_ioc_space(
 	bhv_desc_t		*bdp,
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	struct file		*filp,
 	int			flags,
 	unsigned int		cmd,
@@ -682,7 +677,7 @@ xfs_ioc_fsgeometry(
 
 STATIC int
 xfs_ioc_xattr(
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	xfs_inode_t		*ip,
 	struct file		*filp,
 	unsigned int		cmd,
@@ -711,7 +706,7 @@ xfs_ioctl(
 	void			__user *arg)
 {
 	int			error;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	xfs_inode_t		*ip;
 	xfs_mount_t		*mp;
 
@@ -962,7 +957,7 @@ xfs_ioctl(
 STATIC int
 xfs_ioc_space(
 	bhv_desc_t		*bdp,
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	struct file		*filp,
 	int			ioflags,
 	unsigned int		cmd,
@@ -1153,14 +1148,14 @@ xfs_di2lxflags(
 
 STATIC int
 xfs_ioc_xattr(
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	xfs_inode_t		*ip,
 	struct file		*filp,
 	unsigned int		cmd,
 	void			__user *arg)
 {
 	struct fsxattr		fa;
-	struct vattr		*vattr;
+	struct bhv_vattr	*vattr;
 	int			error = 0;
 	int			attr_flags;
 	unsigned int		flags;
@@ -1173,7 +1168,7 @@ xfs_ioc_xattr(
 	case XFS_IOC_FSGETXATTR: {
 		vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
 				 XFS_AT_NEXTENTS | XFS_AT_PROJID;
-		VOP_GETATTR(vp, vattr, 0, NULL, error);
+		error = bhv_vop_getattr(vp, vattr, 0, NULL);
 		if (unlikely(error)) {
 			error = -error;
 			break;
@@ -1206,7 +1201,7 @@ xfs_ioc_xattr(
 		vattr->va_extsize = fa.fsx_extsize;
 		vattr->va_projid  = fa.fsx_projid;
 
-		VOP_SETATTR(vp, vattr, attr_flags, NULL, error);
+		error = bhv_vop_setattr(vp, vattr, attr_flags, NULL);
 		if (likely(!error))
 			__vn_revalidate(vp, vattr);	/* update flags */
 		error = -error;
@@ -1216,7 +1211,7 @@ xfs_ioc_xattr(
 	case XFS_IOC_FSGETXATTRA: {
 		vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
 				 XFS_AT_ANEXTENTS | XFS_AT_PROJID;
-		VOP_GETATTR(vp, vattr, 0, NULL, error);
+		error = bhv_vop_getattr(vp, vattr, 0, NULL);
 		if (unlikely(error)) {
 			error = -error;
 			break;
@@ -1262,7 +1257,7 @@ xfs_ioc_xattr(
 		vattr->va_xflags = xfs_merge_ioc_xflags(flags,
 							xfs_ip2xflags(ip));
 
-		VOP_SETATTR(vp, vattr, attr_flags, NULL, error);
+		error = bhv_vop_setattr(vp, vattr, attr_flags, NULL);
 		if (likely(!error))
 			__vn_revalidate(vp, vattr);	/* update flags */
 		error = -error;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 251bfe451a3f..601f01c92f7f 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -114,7 +114,7 @@ xfs_compat_ioctl(
 	unsigned long	arg)
 {
 	struct inode	*inode = file->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	int		error;
 
 	switch (cmd) {
@@ -193,7 +193,7 @@ xfs_compat_ioctl(
 		return -ENOIOCTLCMD;
 	}
 
-	VOP_IOCTL(vp, inode, file, mode, cmd, (void __user *)arg, error);
+	error = bhv_vop_ioctl(vp, inode, file, mode, cmd, (void __user *)arg);
 	VMODIFY(vp);
 
 	return error;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 2e2e275c786f..12810baeb5d4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -61,7 +59,7 @@
  */
 xfs_inode_t *
 xfs_vtoi(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	bhv_desc_t      *bdp;
 
@@ -80,7 +78,7 @@ void
 xfs_synchronize_atime(
 	xfs_inode_t	*ip)
 {
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = XFS_ITOV_NULL(ip);
 	if (vp) {
@@ -200,14 +198,10 @@ xfs_ichgtime_fast(
 STATIC void
 xfs_validate_fields(
 	struct inode	*ip,
-	struct vattr	*vattr)
+	bhv_vattr_t	*vattr)
 {
-	vnode_t		*vp = vn_from_inode(ip);
-	int		error;
-
 	vattr->va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS;
-	VOP_GETATTR(vp, vattr, ATTR_LAZY, NULL, error);
-  	if (likely(!error)) {
+	if (!bhv_vop_getattr(vn_from_inode(ip), vattr, ATTR_LAZY, NULL)) {
 		ip->i_nlink = vattr->va_nlink;
 		ip->i_blocks = vattr->va_nblocks;
 
@@ -225,7 +219,7 @@ xfs_validate_fields(
  */
 STATIC int
 xfs_init_security(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	struct inode	*dir)
 {
 	struct inode	*ip = vn_to_inode(vp);
@@ -241,7 +235,7 @@ xfs_init_security(
 		return -error;
 	}
 
-	VOP_ATTR_SET(vp, name, value, length, ATTR_SECURE, NULL, error);
+	error = bhv_vop_attr_set(vp, name, value, length, ATTR_SECURE, NULL);
 	if (!error)
 		VMODIFY(vp);
 
@@ -264,13 +258,12 @@ xfs_has_fs_struct(struct task_struct *task)
 
 STATIC inline void
 xfs_cleanup_inode(
-	vnode_t		*dvp,
-	vnode_t		*vp,
+	bhv_vnode_t	*dvp,
+	bhv_vnode_t	*vp,
 	struct dentry	*dentry,
 	int		mode)
 {
 	struct dentry   teardown = {};
-	int             error;
 
 	/* Oh, the horror.
 	 * If we can't add the ACL or we fail in
@@ -281,9 +274,9 @@ xfs_cleanup_inode(
 	teardown.d_name = dentry->d_name;
 
 	if (S_ISDIR(mode))
-	  	VOP_RMDIR(dvp, &teardown, NULL, error);
+	  	bhv_vop_rmdir(dvp, &teardown, NULL);
 	else
-		VOP_REMOVE(dvp, &teardown, NULL, error);
+		bhv_vop_remove(dvp, &teardown, NULL);
 	VN_RELE(vp);
 }
 
@@ -295,8 +288,8 @@ xfs_vn_mknod(
 	dev_t		rdev)
 {
 	struct inode	*ip;
-	vattr_t		vattr = { 0 };
-	vnode_t		*vp = NULL, *dvp = vn_from_inode(dir);
+	bhv_vattr_t	vattr = { 0 };
+	bhv_vnode_t	*vp = NULL, *dvp = vn_from_inode(dir);
 	xfs_acl_t	*default_acl = NULL;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
 	int		error;
@@ -330,10 +323,10 @@ xfs_vn_mknod(
 		vattr.va_mask |= XFS_AT_RDEV;
 		/*FALLTHROUGH*/
 	case S_IFREG:
-		VOP_CREATE(dvp, dentry, &vattr, &vp, NULL, error);
+		error = bhv_vop_create(dvp, dentry, &vattr, &vp, NULL);
 		break;
 	case S_IFDIR:
-		VOP_MKDIR(dvp, dentry, &vattr, &vp, NULL, error);
+		error = bhv_vop_mkdir(dvp, dentry, &vattr, &vp, NULL);
 		break;
 	default:
 		error = EINVAL;
@@ -396,14 +389,14 @@ xfs_vn_lookup(
 	struct dentry	*dentry,
 	struct nameidata *nd)
 {
-	struct vnode	*vp = vn_from_inode(dir), *cvp;
+	bhv_vnode_t	*vp = vn_from_inode(dir), *cvp;
 	int		error;
 
 	if (dentry->d_name.len >= MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
-	if (error) {
+	error = bhv_vop_lookup(vp, dentry, &cvp, 0, NULL, NULL);
+	if (unlikely(error)) {
 		if (unlikely(error != ENOENT))
 			return ERR_PTR(-error);
 		d_add(dentry, NULL);
@@ -420,9 +413,9 @@ xfs_vn_link(
 	struct dentry	*dentry)
 {
 	struct inode	*ip;	/* inode of guy being linked to */
-	vnode_t		*tdvp;	/* target directory for new name/link */
-	vnode_t		*vp;	/* vp of name being linked */
-	vattr_t		vattr;
+	bhv_vnode_t	*tdvp;	/* target directory for new name/link */
+	bhv_vnode_t	*vp;	/* vp of name being linked */
+	bhv_vattr_t	vattr;
 	int		error;
 
 	ip = old_dentry->d_inode;	/* inode being linked to */
@@ -432,7 +425,7 @@ xfs_vn_link(
 	tdvp = vn_from_inode(dir);
 	vp = vn_from_inode(ip);
 
-	VOP_LINK(tdvp, vp, dentry, NULL, error);
+	error = bhv_vop_link(tdvp, vp, dentry, NULL);
 	if (likely(!error)) {
 		VMODIFY(tdvp);
 		VN_HOLD(vp);
@@ -448,14 +441,14 @@ xfs_vn_unlink(
 	struct dentry	*dentry)
 {
 	struct inode	*inode;
-	vnode_t		*dvp;	/* directory containing name to remove */
-	vattr_t		vattr;
+	bhv_vnode_t	*dvp;	/* directory containing name to remove */
+	bhv_vattr_t	vattr;
 	int		error;
 
 	inode = dentry->d_inode;
 	dvp = vn_from_inode(dir);
 
-	VOP_REMOVE(dvp, dentry, NULL, error);
+	error = bhv_vop_remove(dvp, dentry, NULL);
 	if (likely(!error)) {
 		xfs_validate_fields(dir, &vattr);	/* size needs update */
 		xfs_validate_fields(inode, &vattr);
@@ -470,27 +463,26 @@ xfs_vn_symlink(
 	const char	*symname)
 {
 	struct inode	*ip;
-	vattr_t		vattr = { 0 };
-	vnode_t		*dvp;	/* directory containing name of symlink */
-	vnode_t		*cvp;	/* used to lookup symlink to put in dentry */
+	bhv_vattr_t	va = { 0 };
+	bhv_vnode_t	*dvp;	/* directory containing name of symlink */
+	bhv_vnode_t	*cvp;	/* used to lookup symlink to put in dentry */
 	int		error;
 
 	dvp = vn_from_inode(dir);
 	cvp = NULL;
 
-	vattr.va_mode = S_IFLNK |
+	va.va_mode = S_IFLNK |
 		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
-	vattr.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
+	va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
 
-	error = 0;
-	VOP_SYMLINK(dvp, dentry, &vattr, (char *)symname, &cvp, NULL, error);
+	error = bhv_vop_symlink(dvp, dentry, &va, (char *)symname, &cvp, NULL);
 	if (likely(!error && cvp)) {
 		error = xfs_init_security(cvp, dir);
 		if (likely(!error)) {
 			ip = vn_to_inode(cvp);
 			d_instantiate(dentry, ip);
-			xfs_validate_fields(dir, &vattr);
-			xfs_validate_fields(ip, &vattr);
+			xfs_validate_fields(dir, &va);
+			xfs_validate_fields(ip, &va);
 		} else {
 			xfs_cleanup_inode(dvp, cvp, dentry, 0);
 		}
@@ -504,11 +496,11 @@ xfs_vn_rmdir(
 	struct dentry	*dentry)
 {
 	struct inode	*inode = dentry->d_inode;
-	vnode_t		*dvp = vn_from_inode(dir);
-	vattr_t		vattr;
+	bhv_vnode_t	*dvp = vn_from_inode(dir);
+	bhv_vattr_t	vattr;
 	int		error;
 
-	VOP_RMDIR(dvp, dentry, NULL, error);
+	error = bhv_vop_rmdir(dvp, dentry, NULL);
 	if (likely(!error)) {
 		xfs_validate_fields(inode, &vattr);
 		xfs_validate_fields(dir, &vattr);
@@ -524,15 +516,15 @@ xfs_vn_rename(
 	struct dentry	*ndentry)
 {
 	struct inode	*new_inode = ndentry->d_inode;
-	vnode_t		*fvp;	/* from directory */
-	vnode_t		*tvp;	/* target directory */
-	vattr_t		vattr;
+	bhv_vnode_t	*fvp;	/* from directory */
+	bhv_vnode_t	*tvp;	/* target directory */
+	bhv_vattr_t	vattr;
 	int		error;
 
 	fvp = vn_from_inode(odir);
 	tvp = vn_from_inode(ndir);
 
-	VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
+	error = bhv_vop_rename(fvp, odentry, tvp, ndentry, NULL);
 	if (likely(!error)) {
 		if (new_inode)
 			xfs_validate_fields(new_inode, &vattr);
@@ -553,7 +545,7 @@ xfs_vn_follow_link(
 	struct dentry		*dentry,
 	struct nameidata	*nd)
 {
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	uio_t			*uio;
 	iovec_t			iov;
 	int			error;
@@ -586,8 +578,8 @@ xfs_vn_follow_link(
 	uio->uio_resid = MAXPATHLEN;
 	uio->uio_iovcnt = 1;
 
-	VOP_READLINK(vp, uio, 0, NULL, error);
-	if (error) {
+	error = bhv_vop_readlink(vp, uio, 0, NULL);
+	if (unlikely(error)) {
 		kfree(link);
 		link = ERR_PTR(-error);
 	} else {
@@ -618,12 +610,7 @@ xfs_vn_permission(
 	int		mode,
 	struct nameidata *nd)
 {
-	vnode_t		*vp = vn_from_inode(inode);
-	int		error;
-
-	mode <<= 6;		/* convert from linux to vnode access bits */
-	VOP_ACCESS(vp, mode, NULL, error);
-	return -error;
+	return -bhv_vop_access(vn_from_inode(inode), mode << 6, NULL);
 }
 #else
 #define xfs_vn_permission NULL
@@ -636,14 +623,14 @@ xfs_vn_getattr(
 	struct kstat	*stat)
 {
 	struct inode	*inode = dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	int		error = 0;
 
 	if (unlikely(vp->v_flag & VMODIFIED))
 		error = vn_revalidate(vp);
 	if (!error)
 		generic_fillattr(inode, stat);
-	return 0;
+	return -error;
 }
 
 STATIC int
@@ -653,8 +640,8 @@ xfs_vn_setattr(
 {
 	struct inode	*inode = dentry->d_inode;
 	unsigned int	ia_valid = attr->ia_valid;
-	vnode_t		*vp = vn_from_inode(inode);
-	vattr_t		vattr = { 0 };
+	bhv_vnode_t	*vp = vn_from_inode(inode);
+	bhv_vattr_t	vattr = { 0 };
 	int		flags = 0;
 	int		error;
 
@@ -697,7 +684,7 @@ xfs_vn_setattr(
 		flags |= ATTR_NONBLOCK;
 #endif
 
-	VOP_SETATTR(vp, &vattr, flags, NULL, error);
+	error = bhv_vop_setattr(vp, &vattr, flags, NULL);
 	if (likely(!error))
 		__vn_revalidate(vp, &vattr);
 	return -error;
@@ -718,7 +705,7 @@ xfs_vn_setxattr(
 	size_t		size,
 	int		flags)
 {
-	vnode_t		*vp = vn_from_inode(dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
 	char		*attr = (char *)name;
 	attrnames_t	*namesp;
 	int		xflags = 0;
@@ -748,7 +735,7 @@ xfs_vn_getxattr(
 	void		*data,
 	size_t		size)
 {
-	vnode_t		*vp = vn_from_inode(dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
 	char		*attr = (char *)name;
 	attrnames_t	*namesp;
 	int		xflags = 0;
@@ -777,7 +764,7 @@ xfs_vn_listxattr(
 	char			*data,
 	size_t			size)
 {
-	vnode_t			*vp = vn_from_inode(dentry->d_inode);
+	bhv_vnode_t		*vp = vn_from_inode(dentry->d_inode);
 	int			error, xflags = ATTR_KERNAMELS;
 	ssize_t			result;
 
@@ -796,7 +783,7 @@ xfs_vn_removexattr(
 	struct dentry	*dentry,
 	const char	*name)
 {
-	vnode_t		*vp = vn_from_inode(dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
 	char		*attr = (char *)name;
 	attrnames_t	*namesp;
 	int		xflags = 0;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index e9fe43d74768..aa26ab906c88 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -134,14 +134,21 @@ BUFFER_FNS(PrivateStart, unwritten);
 #define xfs_buf_age_centisecs	xfs_params.xfs_buf_age.val
 #define xfs_inherit_nosymlinks	xfs_params.inherit_nosym.val
 #define xfs_rotorstep		xfs_params.rotorstep.val
+#define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
 
-#ifndef raw_smp_processor_id
-#define raw_smp_processor_id()	smp_processor_id()
-#endif
-#define current_cpu()		raw_smp_processor_id()
+#define current_cpu()		(raw_smp_processor_id())
 #define current_pid()		(current->pid)
 #define current_fsuid(cred)	(current->fsuid)
 #define current_fsgid(cred)	(current->fsgid)
+#define current_set_flags(f)	(current->flags |= (f))
+#define current_test_flags(f)	(current->flags & (f))
+#define current_clear_flags(f)	(current->flags &= ~(f))
+#define current_set_flags_nested(sp, f)		\
+		(*(sp) = current->flags, current->flags |= (f))
+#define current_clear_flags_nested(sp, f)	\
+		(*(sp) = current->flags, current->flags &= ~(f))
+#define current_restore_flags_nested(sp, f)	\
+		(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
 
 #define NBPP		PAGE_SIZE
 #define DPPSHFT		(PAGE_SHIFT - 9)
@@ -187,25 +194,9 @@ BUFFER_FNS(PrivateStart, unwritten);
 /* bytes to clicks */
 #define btoc(x)         (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
 
-#ifndef ENOATTR
 #define ENOATTR		ENODATA		/* Attribute not found */
-#endif
-
-/* Note: EWRONGFS never visible outside the kernel */
-#define	EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
-
-/*
- * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
- *     return codes out of its known range in errno.
- * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
- *     conflict with any code we use already or any code a driver may use)
- * XXX Some options (currently we do #2):
- *	1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
- *	2/ 990 ["Unknown error 990"]
- *	3/ EUCLEAN ["Structure needs cleaning"]
- *	4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
- */
-#define EFSCORRUPTED    990		/* Filesystem is corrupted */
+#define EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
+#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
 
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 67efe3308980..5d9cfd91ad08 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -206,7 +204,7 @@ xfs_read(
 	xfs_fsize_t		n;
 	xfs_inode_t		*ip;
 	xfs_mount_t		*mp;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	unsigned long		seg;
 
 	ip = XFS_BHVTOI(bdp);
@@ -258,7 +256,7 @@ xfs_read(
 
 	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
 	    !(ioflags & IO_INVIS)) {
-		vrwlock_t locktype = VRWLOCK_READ;
+		bhv_vrwlock_t locktype = VRWLOCK_READ;
 		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 
 		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
@@ -271,7 +269,7 @@ xfs_read(
 	}
 
 	if (unlikely((ioflags & IO_ISDIRECT) && VN_CACHED(vp)))
-		VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(*offset)),
+		bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
 						-1, FI_REMAPF_LOCKED);
 
 	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
@@ -313,7 +311,7 @@ xfs_sendfile(
 
 	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
 	    (!(ioflags & IO_INVIS))) {
-		vrwlock_t locktype = VRWLOCK_READ;
+		bhv_vrwlock_t locktype = VRWLOCK_READ;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
@@ -357,7 +355,7 @@ xfs_splice_read(
 
 	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
 	    (!(ioflags & IO_INVIS))) {
-		vrwlock_t locktype = VRWLOCK_READ;
+		bhv_vrwlock_t locktype = VRWLOCK_READ;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
@@ -401,7 +399,7 @@ xfs_splice_write(
 
 	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_WRITE) &&
 	    (!(ioflags & IO_INVIS))) {
-		vrwlock_t locktype = VRWLOCK_WRITE;
+		bhv_vrwlock_t locktype = VRWLOCK_WRITE;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp),
@@ -458,7 +456,7 @@ xfs_zero_last_block(
 	last_fsb = XFS_B_TO_FSBT(mp, isize);
 	nimaps = 1;
 	error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
-			  &nimaps, NULL);
+			  &nimaps, NULL, NULL);
 	if (error) {
 		return error;
 	}
@@ -499,7 +497,7 @@ xfs_zero_last_block(
 
 int					/* error (positive) */
 xfs_zero_eof(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_iocore_t	*io,
 	xfs_off_t	offset,		/* starting I/O offset */
 	xfs_fsize_t	isize,		/* current inode size */
@@ -510,7 +508,6 @@ xfs_zero_eof(
 	xfs_fileoff_t	end_zero_fsb;
 	xfs_fileoff_t	zero_count_fsb;
 	xfs_fileoff_t	last_fsb;
-	xfs_extlen_t	buf_len_fsb;
 	xfs_mount_t	*mp = io->io_mount;
 	int		nimaps;
 	int		error = 0;
@@ -556,7 +553,7 @@ xfs_zero_eof(
 		nimaps = 1;
 		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 		error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
-				  0, NULL, 0, &imap, &nimaps, NULL);
+				  0, NULL, 0, &imap, &nimaps, NULL, NULL);
 		if (error) {
 			ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 			ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
@@ -579,16 +576,7 @@ xfs_zero_eof(
 		}
 
 		/*
-		 * There are blocks in the range requested.
-		 * Zero them a single write at a time.  We actually
-		 * don't zero the entire range returned if it is
-		 * too big and simply loop around to get the rest.
-		 * That is not the most efficient thing to do, but it
-		 * is simple and this path should not be exercised often.
-		 */
-		buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
-					      mp->m_writeio_blocks << 8);
-		/*
+		 * There are blocks we need to zero.
 		 * Drop the inode lock while we're doing the I/O.
 		 * We'll still have the iolock to protect us.
 		 */
@@ -596,14 +584,13 @@ xfs_zero_eof(
 
 		error = xfs_iozero(ip,
 				   XFS_FSB_TO_B(mp, start_zero_fsb),
-				   XFS_FSB_TO_B(mp, buf_len_fsb),
+				   XFS_FSB_TO_B(mp, imap.br_blockcount),
 				   end_size);
-
 		if (error) {
 			goto out_lock;
 		}
 
-		start_zero_fsb = imap.br_startoff + buf_len_fsb;
+		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
 		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 
 		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
@@ -637,11 +624,11 @@ xfs_write(
 	ssize_t			ret = 0, error = 0;
 	xfs_fsize_t		isize, new_size;
 	xfs_iocore_t		*io;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	unsigned long		seg;
 	int			iolock;
 	int			eventsent = 0;
-	vrwlock_t		locktype;
+	bhv_vrwlock_t		locktype;
 	size_t			ocount = 0, count;
 	loff_t			pos;
 	int			need_i_mutex = 1, need_flush = 0;
@@ -679,11 +666,11 @@ xfs_write(
 	io = &xip->i_iocore;
 	mp = io->io_mount;
 
+	vfs_wait_for_freeze(vp->v_vfsp, SB_FREEZE_WRITE);
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
-
 	if (ioflags & IO_ISDIRECT) {
 		xfs_buftarg_t	*target =
 			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
@@ -814,7 +801,7 @@ retry:
 		if (need_flush) {
 			xfs_inval_cached_trace(io, pos, -1,
 					ctooff(offtoct(pos)), -1);
-			VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
+			bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
 					-1, FI_REMAPF_LOCKED);
 		}
 
@@ -903,79 +890,9 @@ retry:
 
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
-		/*
-		 * If we're treating this as O_DSYNC and we have not updated the
-		 * size, force the log.
-		 */
-		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-		    !(xip->i_update_size)) {
-			xfs_inode_log_item_t	*iip = xip->i_itemp;
-
-			/*
-			 * If an allocation transaction occurred
-			 * without extending the size, then we have to force
-			 * the log up the proper point to ensure that the
-			 * allocation is permanent.  We can't count on
-			 * the fact that buffered writes lock out direct I/O
-			 * writes - the direct I/O write could have extended
-			 * the size nontransactionally, then finished before
-			 * we started.  xfs_write_file will think that the file
-			 * didn't grow but the update isn't safe unless the
-			 * size change is logged.
-			 *
-			 * Force the log if we've committed a transaction
-			 * against the inode or if someone else has and
-			 * the commit record hasn't gone to disk (e.g.
-			 * the inode is pinned).  This guarantees that
-			 * all changes affecting the inode are permanent
-			 * when we return.
-			 */
-			if (iip && iip->ili_last_lsn) {
-				xfs_log_force(mp, iip->ili_last_lsn,
-						XFS_LOG_FORCE | XFS_LOG_SYNC);
-			} else if (xfs_ipincount(xip) > 0) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-						XFS_LOG_FORCE | XFS_LOG_SYNC);
-			}
-
-		} else {
-			xfs_trans_t	*tp;
-
-			/*
-			 * O_SYNC or O_DSYNC _with_ a size update are handled
-			 * the same way.
-			 *
-			 * If the write was synchronous then we need to make
-			 * sure that the inode modification time is permanent.
-			 * We'll have updated the timestamp above, so here
-			 * we use a synchronous transaction to log the inode.
-			 * It's not fast, but it's necessary.
-			 *
-			 * If this a dsync write and the size got changed
-			 * non-transactionally, then we need to ensure that
-			 * the size change gets logged in a synchronous
-			 * transaction.
-			 */
-
-			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-			if ((error = xfs_trans_reserve(tp, 0,
-						      XFS_SWRITE_LOG_RES(mp),
-						      0, 0, 0))) {
-				/* Transaction reserve failed */
-				xfs_trans_cancel(tp, 0);
-			} else {
-				/* Transaction reserve successful */
-				xfs_ilock(xip, XFS_ILOCK_EXCL);
-				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(tp, xip);
-				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
-				xfs_trans_set_sync(tp);
-				error = xfs_trans_commit(tp, 0, NULL);
-				xfs_iunlock(xip, XFS_ILOCK_EXCL);
-			}
-			if (error)
-				goto out_unlock_internal;
-		}
+		error = xfs_write_sync_logforce(mp, xip);
+		if (error)
+			goto out_unlock_internal;
 
 		xfs_rwunlock(bdp, locktype);
 		if (need_i_mutex)
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index 8f4539952350..c77e62efb742 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -18,8 +18,8 @@
 #ifndef __XFS_LRW_H__
 #define __XFS_LRW_H__
 
-struct vnode;
 struct bhv_desc;
+struct bhv_vnode;
 struct xfs_mount;
 struct xfs_iocore;
 struct xfs_inode;
@@ -49,7 +49,7 @@ struct xfs_iomap;
 #define	XFS_CTRUNC4		14
 #define	XFS_CTRUNC5		15
 #define	XFS_CTRUNC6		16
-#define	XFS_BUNMAPI		17
+#define	XFS_BUNMAP		17
 #define	XFS_INVAL_CACHED	18
 #define	XFS_DIORD_ENTER		19
 #define	XFS_DIOWR_ENTER		20
@@ -82,7 +82,7 @@ extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
 extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
 
-extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t,
+extern int xfs_zero_eof(struct bhv_vnode *, struct xfs_iocore *, xfs_off_t,
 				xfs_fsize_t, xfs_fsize_t);
 extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
 				const struct iovec *, unsigned int,
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 1f0589a05eca..e480b6102051 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -62,7 +62,7 @@ xfs_read_xfsstats(
 		while (j < xstats[i].endpoint) {
 			val = 0;
 			/* sum over all cpus */
-			for_each_cpu(c)
+			for_each_possible_cpu(c)
 				val += *(((__u32*)&per_cpu(xfsstats, c) + j));
 			len += sprintf(buffer + len, " %u", val);
 			j++;
@@ -70,7 +70,7 @@ xfs_read_xfsstats(
 		buffer[len++] = '\n';
 	}
 	/* extra precision counters */
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
 		xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
 		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 68f4793e8a11..9bdef9d51900 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -151,7 +149,7 @@ xfs_set_inodeops(
 STATIC __inline__ void
 xfs_revalidate_inode(
 	xfs_mount_t		*mp,
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	xfs_inode_t		*ip)
 {
 	struct inode		*inode = vn_to_inode(vp);
@@ -206,7 +204,7 @@ xfs_revalidate_inode(
 void
 xfs_initialize_vnode(
 	bhv_desc_t		*bdp,
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	bhv_desc_t		*inode_bhv,
 	int			unlock)
 {
@@ -336,7 +334,7 @@ STATIC struct inode *
 xfs_fs_alloc_inode(
 	struct super_block	*sb)
 {
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 
 	vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
 	if (unlikely(!vp))
@@ -359,13 +357,13 @@ xfs_fs_inode_init_once(
 {
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 		      SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(vn_to_inode((vnode_t *)vnode));
+		inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
 }
 
 STATIC int
 xfs_init_zones(void)
 {
-	xfs_vnode_zone = kmem_zone_init_flags(sizeof(vnode_t), "xfs_vnode_t",
+	xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
 					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
 					KM_ZONE_SPREAD,
 					xfs_fs_inode_init_once);
@@ -409,22 +407,17 @@ xfs_fs_write_inode(
 	struct inode		*inode,
 	int			sync)
 {
-	vnode_t			*vp = vn_from_inode(inode);
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 	int			error = 0, flags = FLUSH_INODE;
 
 	if (vp) {
 		vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 		if (sync)
 			flags |= FLUSH_SYNC;
-		VOP_IFLUSH(vp, flags, error);
-		if (error == EAGAIN) {
-			if (sync)
-				VOP_IFLUSH(vp, flags | FLUSH_LOG, error);
-			else
-				error = 0;
-		}
+		error = bhv_vop_iflush(vp, flags);
+		if (error == EAGAIN)
+			error = sync? bhv_vop_iflush(vp, flags | FLUSH_LOG) : 0;
 	}
-
 	return -error;
 }
 
@@ -432,8 +425,7 @@ STATIC void
 xfs_fs_clear_inode(
 	struct inode		*inode)
 {
-	vnode_t			*vp = vn_from_inode(inode);
-	int			error, cache;
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 
@@ -446,20 +438,18 @@ xfs_fs_clear_inode(
 	 * This can happen because xfs_iget_core calls xfs_idestroy if we
 	 * find an inode with di_mode == 0 but without IGET_CREATE set.
 	 */
-	if (vp->v_fbhv)
-		VOP_INACTIVE(vp, NULL, cache);
+	if (VNHEAD(vp))
+		bhv_vop_inactive(vp, NULL);
 
 	VN_LOCK(vp);
 	vp->v_flag &= ~VMODIFIED;
 	VN_UNLOCK(vp, 0);
 
-	if (vp->v_fbhv) {
-		VOP_RECLAIM(vp, error);
-		if (error)
-			panic("vn_purge: cannot reclaim");
-	}
+	if (VNHEAD(vp))
+		if (bhv_vop_reclaim(vp))
+			panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, vp);
 
-	ASSERT(vp->v_fbhv == NULL);
+	ASSERT(VNHEAD(vp) == NULL);
 
 #ifdef XFS_VNODE_TRACE
 	ktrace_free(vp->v_trace);
@@ -475,13 +465,13 @@ xfs_fs_clear_inode(
  */
 STATIC void
 xfs_syncd_queue_work(
-	struct vfs	*vfs,
+	struct bhv_vfs	*vfs,
 	void		*data,
-	void		(*syncer)(vfs_t *, void *))
+	void		(*syncer)(bhv_vfs_t *, void *))
 {
-	vfs_sync_work_t	*work;
+	struct bhv_vfs_sync_work *work;
 
-	work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP);
+	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
 	INIT_LIST_HEAD(&work->w_list);
 	work->w_syncer = syncer;
 	work->w_data = data;
@@ -500,7 +490,7 @@ xfs_syncd_queue_work(
  */
 STATIC void
 xfs_flush_inode_work(
-	vfs_t		*vfs,
+	bhv_vfs_t	*vfs,
 	void		*inode)
 {
 	filemap_flush(((struct inode *)inode)->i_mapping);
@@ -512,7 +502,7 @@ xfs_flush_inode(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = vn_to_inode(XFS_ITOV(ip));
-	struct vfs	*vfs = XFS_MTOVFS(ip->i_mount);
+	struct bhv_vfs	*vfs = XFS_MTOVFS(ip->i_mount);
 
 	igrab(inode);
 	xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work);
@@ -525,7 +515,7 @@ xfs_flush_inode(
  */
 STATIC void
 xfs_flush_device_work(
-	vfs_t		*vfs,
+	bhv_vfs_t	*vfs,
 	void		*inode)
 {
 	sync_blockdev(vfs->vfs_super->s_bdev);
@@ -537,7 +527,7 @@ xfs_flush_device(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = vn_to_inode(XFS_ITOV(ip));
-	struct vfs	*vfs = XFS_MTOVFS(ip->i_mount);
+	struct bhv_vfs	*vfs = XFS_MTOVFS(ip->i_mount);
 
 	igrab(inode);
 	xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work);
@@ -545,16 +535,16 @@ xfs_flush_device(
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
 
-#define SYNCD_FLAGS	(SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR|SYNC_REFCACHE)
 STATIC void
 vfs_sync_worker(
-	vfs_t		*vfsp,
+	bhv_vfs_t	*vfsp,
 	void		*unused)
 {
 	int		error;
 
 	if (!(vfsp->vfs_flag & VFS_RDONLY))
-		VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error);
+		error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \
+					SYNC_ATTR | SYNC_REFCACHE, NULL);
 	vfsp->vfs_sync_seq++;
 	wmb();
 	wake_up(&vfsp->vfs_wait_single_sync_task);
@@ -565,8 +555,8 @@ xfssyncd(
 	void			*arg)
 {
 	long			timeleft;
-	vfs_t			*vfsp = (vfs_t *) arg;
-	struct vfs_sync_work	*work, *n;
+	bhv_vfs_t		*vfsp = (bhv_vfs_t *) arg;
+	bhv_vfs_sync_work_t	*work, *n;
 	LIST_HEAD		(tmp);
 
 	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
@@ -600,7 +590,7 @@ xfssyncd(
 			list_del(&work->w_list);
 			if (work == &vfsp->vfs_sync_work)
 				continue;
-			kmem_free(work, sizeof(struct vfs_sync_work));
+			kmem_free(work, sizeof(struct bhv_vfs_sync_work));
 		}
 	}
 
@@ -609,7 +599,7 @@ xfssyncd(
 
 STATIC int
 xfs_fs_start_syncd(
-	vfs_t			*vfsp)
+	bhv_vfs_t		*vfsp)
 {
 	vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
 	vfsp->vfs_sync_work.w_vfs = vfsp;
@@ -621,7 +611,7 @@ xfs_fs_start_syncd(
 
 STATIC void
 xfs_fs_stop_syncd(
-	vfs_t			*vfsp)
+	bhv_vfs_t		*vfsp)
 {
 	kthread_stop(vfsp->vfs_sync_task);
 }
@@ -630,35 +620,26 @@ STATIC void
 xfs_fs_put_super(
 	struct super_block	*sb)
 {
-	vfs_t			*vfsp = vfs_from_sb(sb);
+	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
 	int			error;
 
 	xfs_fs_stop_syncd(vfsp);
-	VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error);
-	if (!error)
-		VFS_UNMOUNT(vfsp, 0, NULL, error);
+	bhv_vfs_sync(vfsp, SYNC_ATTR | SYNC_DELWRI, NULL);
+	error = bhv_vfs_unmount(vfsp, 0, NULL);
 	if (error) {
-		printk("XFS unmount got error %d\n", error);
-		printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp);
-		return;
+		printk("XFS: unmount got error=%d\n", error);
+		printk("%s: vfs=0x%p left dangling!\n", __FUNCTION__, vfsp);
+	} else {
+		vfs_deallocate(vfsp);
 	}
-
-	vfs_deallocate(vfsp);
 }
 
 STATIC void
 xfs_fs_write_super(
 	struct super_block	*sb)
 {
-	vfs_t			*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	if (sb->s_flags & MS_RDONLY) {
-		sb->s_dirt = 0; /* paranoia */
-		return;
-	}
-	/* Push the log and superblock a little */
-	VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error);
+	if (!(sb->s_flags & MS_RDONLY))
+		bhv_vfs_sync(vfs_from_sb(sb), SYNC_FSDATA, NULL);
 	sb->s_dirt = 0;
 }
 
@@ -667,16 +648,16 @@ xfs_fs_sync_super(
 	struct super_block	*sb,
 	int			wait)
 {
-	vfs_t		*vfsp = vfs_from_sb(sb);
-	int		error;
-	int		flags = SYNC_FSDATA;
+	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
+	int			error;
+	int			flags;
 
 	if (unlikely(sb->s_frozen == SB_FREEZE_WRITE))
 		flags = SYNC_QUIESCE;
 	else
 		flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0);
 
-	VFS_SYNC(vfsp, flags, NULL, error);
+	error = bhv_vfs_sync(vfsp, flags, NULL);
 	sb->s_dirt = 0;
 
 	if (unlikely(laptop_mode)) {
@@ -703,14 +684,11 @@ xfs_fs_sync_super(
 
 STATIC int
 xfs_fs_statfs(
-	struct super_block	*sb,
+	struct dentry		*dentry,
 	struct kstatfs		*statp)
 {
-	vfs_t			*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	VFS_STATVFS(vfsp, statp, NULL, error);
-	return -error;
+	return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp,
+				vn_from_inode(dentry->d_inode));
 }
 
 STATIC int
@@ -719,13 +697,13 @@ xfs_fs_remount(
 	int			*flags,
 	char			*options)
 {
-	vfs_t			*vfsp = vfs_from_sb(sb);
+	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
 	struct xfs_mount_args	*args = xfs_args_allocate(sb, 0);
 	int			error;
 
-	VFS_PARSEARGS(vfsp, options, args, 1, error);
+	error = bhv_vfs_parseargs(vfsp, options, args, 1);
 	if (!error)
-		VFS_MNTUPDATE(vfsp, flags, args, error);
+		error = bhv_vfs_mntupdate(vfsp, flags, args);
 	kmem_free(args, sizeof(*args));
 	return -error;
 }
@@ -734,7 +712,7 @@ STATIC void
 xfs_fs_lockfs(
 	struct super_block	*sb)
 {
-	VFS_FREEZE(vfs_from_sb(sb));
+	bhv_vfs_freeze(vfs_from_sb(sb));
 }
 
 STATIC int
@@ -742,11 +720,7 @@ xfs_fs_show_options(
 	struct seq_file		*m,
 	struct vfsmount		*mnt)
 {
-	struct vfs		*vfsp = vfs_from_sb(mnt->mnt_sb);
-	int			error;
-
-	VFS_SHOWARGS(vfsp, m, error);
-	return error;
+	return -bhv_vfs_showargs(vfs_from_sb(mnt->mnt_sb), m);
 }
 
 STATIC int
@@ -754,11 +728,7 @@ xfs_fs_quotasync(
 	struct super_block	*sb,
 	int			type)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, Q_XQUOTASYNC, 0, (caddr_t)NULL, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XQUOTASYNC, 0, NULL);
 }
 
 STATIC int
@@ -766,11 +736,7 @@ xfs_fs_getxstate(
 	struct super_block	*sb,
 	struct fs_quota_stat	*fqs)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
 }
 
 STATIC int
@@ -779,11 +745,7 @@ xfs_fs_setxstate(
 	unsigned int		flags,
 	int			op)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb), op, 0, (caddr_t)&flags);
 }
 
 STATIC int
@@ -793,13 +755,10 @@ xfs_fs_getxquota(
 	qid_t			id,
 	struct fs_disk_quota	*fdq)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error, getmode;
-
-	getmode = (type == USRQUOTA) ? Q_XGETQUOTA :
-		 ((type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETPQUOTA);
-	VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb),
+				 (type == USRQUOTA) ? Q_XGETQUOTA :
+				  ((type == GRPQUOTA) ? Q_XGETGQUOTA :
+				   Q_XGETPQUOTA), id, (caddr_t)fdq);
 }
 
 STATIC int
@@ -809,13 +768,10 @@ xfs_fs_setxquota(
 	qid_t			id,
 	struct fs_disk_quota	*fdq)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error, setmode;
-
-	setmode = (type == USRQUOTA) ? Q_XSETQLIM :
-		 ((type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETPQLIM);
-	VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb),
+				 (type == USRQUOTA) ? Q_XSETQLIM :
+				  ((type == GRPQUOTA) ? Q_XSETGQLIM :
+				   Q_XSETPQLIM), id, (caddr_t)fdq);
 }
 
 STATIC int
@@ -824,34 +780,32 @@ xfs_fs_fill_super(
 	void			*data,
 	int			silent)
 {
-	vnode_t			*rootvp;
-	struct vfs		*vfsp = vfs_allocate(sb);
+	struct bhv_vnode	*rootvp;
+	struct bhv_vfs		*vfsp = vfs_allocate(sb);
 	struct xfs_mount_args	*args = xfs_args_allocate(sb, silent);
 	struct kstatfs		statvfs;
-	int			error, error2;
+	int			error;
 
 	bhv_insert_all_vfsops(vfsp);
 
-	VFS_PARSEARGS(vfsp, (char *)data, args, 0, error);
+	error = bhv_vfs_parseargs(vfsp, (char *)data, args, 0);
 	if (error) {
 		bhv_remove_all_vfsops(vfsp, 1);
 		goto fail_vfsop;
 	}
 
 	sb_min_blocksize(sb, BBSIZE);
-#ifdef CONFIG_XFS_EXPORT
 	sb->s_export_op = &xfs_export_operations;
-#endif
 	sb->s_qcop = &xfs_quotactl_operations;
 	sb->s_op = &xfs_super_operations;
 
-	VFS_MOUNT(vfsp, args, NULL, error);
+	error = bhv_vfs_mount(vfsp, args, NULL);
 	if (error) {
 		bhv_remove_all_vfsops(vfsp, 1);
 		goto fail_vfsop;
 	}
 
-	VFS_STATVFS(vfsp, &statvfs, NULL, error);
+	error = bhv_vfs_statvfs(vfsp, &statvfs, NULL);
 	if (error)
 		goto fail_unmount;
 
@@ -863,7 +817,7 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	VFS_ROOT(vfsp, &rootvp, error);
+	error = bhv_vfs_root(vfsp, &rootvp);
 	if (error)
 		goto fail_unmount;
 
@@ -892,7 +846,7 @@ fail_vnrele:
 	}
 
 fail_unmount:
-	VFS_UNMOUNT(vfsp, 0, NULL, error2);
+	bhv_vfs_unmount(vfsp, 0, NULL);
 
 fail_vfsop:
 	vfs_deallocate(vfsp);
@@ -900,14 +854,16 @@ fail_vfsop:
 	return -error;
 }
 
-STATIC struct super_block *
+STATIC int
 xfs_fs_get_sb(
 	struct file_system_type	*fs_type,
 	int			flags,
 	const char		*dev_name,
-	void			*data)
+	void			*data,
+	struct vfsmount		*mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
+			   mnt);
 }
 
 STATIC struct super_operations xfs_super_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 376b96cb513a..33dd1ca13245 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -105,7 +105,7 @@ struct block_device;
 
 extern __uint64_t xfs_max_file_offset(unsigned int);
 
-extern void xfs_initialize_vnode(bhv_desc_t *, vnode_t *, bhv_desc_t *, int);
+extern void xfs_initialize_vnode(bhv_desc_t *, bhv_vnode_t *, bhv_desc_t *, int);
 
 extern void xfs_flush_inode(struct xfs_inode *);
 extern void xfs_flush_device(struct xfs_inode *);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7079cc837210..af246532fbfb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -38,7 +38,7 @@ xfs_stats_clear_proc_handler(
 
 	if (!ret && write && *valp) {
 		printk("XFS Clearing xfsstats\n");
-		for_each_cpu(c) {
+		for_each_possible_cpu(c) {
 			preempt_disable();
 			/* save vn_active, it's a universal truth! */
 			vn_active = per_cpu(xfsstats, c).vn_active;
@@ -120,6 +120,11 @@ STATIC ctl_table xfs_table[] = {
 	&sysctl_intvec, NULL,
 	&xfs_params.rotorstep.min, &xfs_params.rotorstep.max},
 
+	{XFS_INHERIT_NODFRG, "inherit_nodefrag", &xfs_params.inherit_nodfrg.val,
+	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	&sysctl_intvec, NULL,
+	&xfs_params.inherit_nodfrg.min, &xfs_params.inherit_nodfrg.max},
+
 	/* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
 	{XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val,
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index bc8c11f13722..a631fb8cc5ac 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -46,6 +46,7 @@ typedef struct xfs_param {
 	xfs_sysctl_val_t xfs_buf_age;	/* Metadata buffer age before flush. */
 	xfs_sysctl_val_t inherit_nosym;	/* Inherit the "nosymlinks" flag. */
 	xfs_sysctl_val_t rotorstep;	/* inode32 AG rotoring control knob */
+	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
 } xfs_param_t;
 
 /*
@@ -84,6 +85,7 @@ enum {
 	/* XFS_IO_BYPASS = 18 */
 	XFS_INHERIT_NOSYM = 19,
 	XFS_ROTORSTEP = 20,
+	XFS_INHERIT_NODFRG = 21,
 };
 
 extern xfs_param_t	xfs_params;
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
index 6f7c9f7a8624..6145e8bd0be2 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_imap.h"
 #include "xfs_alloc.h"
@@ -104,7 +103,7 @@ vfs_mntupdate(
 int
 vfs_root(
 	struct bhv_desc		*bdp,
-	struct vnode		**vpp)
+	struct bhv_vnode	**vpp)
 {
 	struct bhv_desc		*next = bdp;
 
@@ -117,15 +116,15 @@ vfs_root(
 int
 vfs_statvfs(
 	struct bhv_desc		*bdp,
-	xfs_statfs_t		*sp,
-	struct vnode		*vp)
+	bhv_statvfs_t		*statp,
+	struct bhv_vnode	*vp)
 {
 	struct bhv_desc		*next = bdp;
 
 	ASSERT(next);
 	while (! (bhvtovfsops(next))->vfs_statvfs)
 		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp));
+	return ((*bhvtovfsops(next)->vfs_statvfs)(next, statp, vp));
 }
 
 int
@@ -145,7 +144,7 @@ vfs_sync(
 int
 vfs_vget(
 	struct bhv_desc		*bdp,
-	struct vnode		**vpp,
+	struct bhv_vnode	**vpp,
 	struct fid		*fidp)
 {
 	struct bhv_desc		*next = bdp;
@@ -187,7 +186,7 @@ vfs_quotactl(
 void
 vfs_init_vnode(
 	struct bhv_desc		*bdp,
-	struct vnode		*vp,
+	struct bhv_vnode	*vp,
 	struct bhv_desc		*bp,
 	int			unlock)
 {
@@ -226,13 +225,13 @@ vfs_freeze(
 	((*bhvtovfsops(next)->vfs_freeze)(next));
 }
 
-vfs_t *
+bhv_vfs_t *
 vfs_allocate(
 	struct super_block	*sb)
 {
-	struct vfs		*vfsp;
+	struct bhv_vfs		*vfsp;
 
-	vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP);
+	vfsp = kmem_zalloc(sizeof(bhv_vfs_t), KM_SLEEP);
 	bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
 	INIT_LIST_HEAD(&vfsp->vfs_sync_list);
 	spin_lock_init(&vfsp->vfs_sync_lock);
@@ -247,25 +246,25 @@ vfs_allocate(
 	return vfsp;
 }
 
-vfs_t *
+bhv_vfs_t *
 vfs_from_sb(
 	struct super_block	*sb)
 {
-	return (vfs_t *)sb->s_fs_info;
+	return (bhv_vfs_t *)sb->s_fs_info;
 }
 
 void
 vfs_deallocate(
-	struct vfs		*vfsp)
+	struct bhv_vfs		*vfsp)
 {
 	bhv_head_destroy(VFS_BHVHEAD(vfsp));
-	kmem_free(vfsp, sizeof(vfs_t));
+	kmem_free(vfsp, sizeof(bhv_vfs_t));
 }
 
 void
 vfs_insertops(
-	struct vfs		*vfsp,
-	struct bhv_vfsops	*vfsops)
+	struct bhv_vfs		*vfsp,
+	struct bhv_module_vfsops *vfsops)
 {
 	struct bhv_desc		*bdp;
 
@@ -276,9 +275,9 @@ vfs_insertops(
 
 void
 vfs_insertbhv(
-	struct vfs		*vfsp,
+	struct bhv_vfs		*vfsp,
 	struct bhv_desc		*bdp,
-	struct vfsops		*vfsops,
+	struct bhv_vfsops	*vfsops,
 	void			*mount)
 {
 	bhv_desc_init(bdp, mount, vfsp, vfsops);
@@ -287,7 +286,7 @@ vfs_insertbhv(
 
 void
 bhv_remove_vfsops(
-	struct vfs		*vfsp,
+	struct bhv_vfs		*vfsp,
 	int			pos)
 {
 	struct bhv_desc		*bhv;
@@ -301,7 +300,7 @@ bhv_remove_vfsops(
 
 void
 bhv_remove_all_vfsops(
-	struct vfs		*vfsp,
+	struct bhv_vfs		*vfsp,
 	int			freebase)
 {
 	struct xfs_mount	*mp;
@@ -317,7 +316,7 @@ bhv_remove_all_vfsops(
 
 void
 bhv_insert_all_vfsops(
-	struct vfs		*vfsp)
+	struct bhv_vfs		*vfsp)
 {
 	struct xfs_mount	*mp;
 
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 841200c03092..91fc2c4b3353 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -21,42 +21,40 @@
 #include <linux/vfs.h>
 #include "xfs_fs.h"
 
+struct bhv_vfs;
+struct bhv_vnode;
+
 struct fid;
-struct vfs;
 struct cred;
-struct vnode;
-struct kstatfs;
 struct seq_file;
 struct super_block;
 struct xfs_mount_args;
 
-typedef struct kstatfs xfs_statfs_t;
+typedef struct kstatfs	bhv_statvfs_t;
 
-typedef struct vfs_sync_work {
+typedef struct bhv_vfs_sync_work {
 	struct list_head	w_list;
-	struct vfs		*w_vfs;
+	struct bhv_vfs		*w_vfs;
 	void			*w_data;	/* syncer routine argument */
-	void			(*w_syncer)(struct vfs *, void *);
-} vfs_sync_work_t;
+	void			(*w_syncer)(struct bhv_vfs *, void *);
+} bhv_vfs_sync_work_t;
 
-typedef struct vfs {
+typedef struct bhv_vfs {
 	u_int			vfs_flag;	/* flags */
 	xfs_fsid_t		vfs_fsid;	/* file system ID */
 	xfs_fsid_t		*vfs_altfsid;	/* An ID fixed for life of FS */
 	bhv_head_t		vfs_bh;		/* head of vfs behavior chain */
 	struct super_block	*vfs_super;	/* generic superblock pointer */
 	struct task_struct	*vfs_sync_task;	/* generalised sync thread */
-	vfs_sync_work_t		vfs_sync_work;	/* work item for VFS_SYNC */
+	bhv_vfs_sync_work_t	vfs_sync_work;	/* work item for bhv_vfs_sync */
 	struct list_head	vfs_sync_list;	/* sync thread work item list */
 	spinlock_t		vfs_sync_lock;	/* work item list lock */
-	int 			vfs_sync_seq;	/* sync thread generation no. */
+	int			vfs_sync_seq;	/* sync thread generation no. */
 	wait_queue_head_t	vfs_wait_single_sync_task;
-} vfs_t;
-
-#define vfs_fbhv		vfs_bh.bh_first	/* 1st on vfs behavior chain */
+} bhv_vfs_t;
 
-#define bhvtovfs(bdp)		( (struct vfs *)BHV_VOBJ(bdp) )
-#define bhvtovfsops(bdp)	( (struct vfsops *)BHV_OPS(bdp) )
+#define bhvtovfs(bdp)		( (struct bhv_vfs *)BHV_VOBJ(bdp) )
+#define bhvtovfsops(bdp)	( (struct bhv_vfsops *)BHV_OPS(bdp) )
 #define VFS_BHVHEAD(vfs)	( &(vfs)->vfs_bh )
 #define VFS_REMOVEBHV(vfs, bdp)	( bhv_remove(VFS_BHVHEAD(vfs), bdp) )
 
@@ -71,7 +69,7 @@ typedef enum {
 	VFS_BHV_QM,		/* quota manager */
 	VFS_BHV_IO,		/* IO path */
 	VFS_BHV_END		/* housekeeping end-of-range */
-} vfs_bhv_t;
+} bhv_vfs_type_t;
 
 #define VFS_POSITION_XFS	(BHV_POSITION_BASE)
 #define VFS_POSITION_DM		(VFS_POSITION_BASE+10)
@@ -81,8 +79,9 @@ typedef enum {
 #define VFS_RDONLY		0x0001	/* read-only vfs */
 #define VFS_GRPID		0x0002	/* group-ID assigned from directory */
 #define VFS_DMI			0x0004	/* filesystem has the DMI enabled */
-#define VFS_32BITINODES		0x0008	/* do not use inums above 32 bits */
-#define VFS_END			0x0008	/* max flag */
+#define VFS_UMOUNT		0x0008	/* unmount in progress */
+#define VFS_32BITINODES		0x0010	/* do not use inums above 32 bits */
+#define VFS_END			0x0010	/* max flag */
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
 #define SYNC_CLOSE		0x0002	/* close file system down */
@@ -92,7 +91,14 @@ typedef enum {
 #define SYNC_FSDATA		0x0020	/* flush fs data (e.g. superblocks) */
 #define SYNC_REFCACHE		0x0040  /* prune some of the nfs ref cache */
 #define SYNC_REMOUNT		0x0080  /* remount readonly, no dummy LRs */
-#define SYNC_QUIESCE		0x0100  /* quiesce filesystem for a snapshot */
+#define SYNC_QUIESCE		0x0100  /* quiesce filesystem for a snapshot */
+
+#define SHUTDOWN_META_IO_ERROR	0x0001	/* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR	0x0002	/* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT	0x0004	/* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE	0x0008	/* corrupt in-memory data structures */
+#define SHUTDOWN_REMOTE_REQ	0x0010	/* shutdown came from remote cell */
+#define SHUTDOWN_DEVICE_REQ	0x0020	/* failed all paths to the device */
 
 typedef int	(*vfs_mount_t)(bhv_desc_t *,
 				struct xfs_mount_args *, struct cred *);
@@ -102,18 +108,19 @@ typedef	int	(*vfs_showargs_t)(bhv_desc_t *, struct seq_file *);
 typedef int	(*vfs_unmount_t)(bhv_desc_t *, int, struct cred *);
 typedef int	(*vfs_mntupdate_t)(bhv_desc_t *, int *,
 				struct xfs_mount_args *);
-typedef int	(*vfs_root_t)(bhv_desc_t *, struct vnode **);
-typedef int	(*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
+typedef int	(*vfs_root_t)(bhv_desc_t *, struct bhv_vnode **);
+typedef int	(*vfs_statvfs_t)(bhv_desc_t *, bhv_statvfs_t *,
+				struct bhv_vnode *);
 typedef int	(*vfs_sync_t)(bhv_desc_t *, int, struct cred *);
-typedef int	(*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *);
+typedef int	(*vfs_vget_t)(bhv_desc_t *, struct bhv_vnode **, struct fid *);
 typedef int	(*vfs_dmapiops_t)(bhv_desc_t *, caddr_t);
 typedef int	(*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t);
 typedef void	(*vfs_init_vnode_t)(bhv_desc_t *,
-				struct vnode *, bhv_desc_t *, int);
+				struct bhv_vnode *, bhv_desc_t *, int);
 typedef void	(*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
 typedef void	(*vfs_freeze_t)(bhv_desc_t *);
 
-typedef struct vfsops {
+typedef struct bhv_vfsops {
 	bhv_position_t		vf_position;	/* behavior chain position */
 	vfs_mount_t		vfs_mount;	/* mount file system */
 	vfs_parseargs_t		vfs_parseargs;	/* parse mount options */
@@ -129,82 +136,82 @@ typedef struct vfsops {
 	vfs_init_vnode_t	vfs_init_vnode;	/* initialize a new vnode */
 	vfs_force_shutdown_t	vfs_force_shutdown;	/* crash and burn */
 	vfs_freeze_t		vfs_freeze;	/* freeze fs for snapshot */
-} vfsops_t;
+} bhv_vfsops_t;
 
 /*
- * VFS's.  Operates on vfs structure pointers (starts at bhv head).
+ * Virtual filesystem operations, operating from head bhv.
  */
-#define VHEAD(v)			((v)->vfs_fbhv)
-#define VFS_MOUNT(v, ma,cr, rv)		((rv) = vfs_mount(VHEAD(v), ma,cr))
-#define VFS_PARSEARGS(v, o,ma,f, rv)	((rv) = vfs_parseargs(VHEAD(v), o,ma,f))
-#define VFS_SHOWARGS(v, m, rv)		((rv) = vfs_showargs(VHEAD(v), m))
-#define VFS_UNMOUNT(v, f, cr, rv)	((rv) = vfs_unmount(VHEAD(v), f,cr))
-#define VFS_MNTUPDATE(v, fl, args, rv)	((rv) = vfs_mntupdate(VHEAD(v), fl, args))
-#define VFS_ROOT(v, vpp, rv)		((rv) = vfs_root(VHEAD(v), vpp))
-#define VFS_STATVFS(v, sp,vp, rv)	((rv) = vfs_statvfs(VHEAD(v), sp,vp))
-#define VFS_SYNC(v, flag,cr, rv)	((rv) = vfs_sync(VHEAD(v), flag,cr))
-#define VFS_VGET(v, vpp,fidp, rv)	((rv) = vfs_vget(VHEAD(v), vpp,fidp))
-#define VFS_DMAPIOPS(v, p, rv)		((rv) = vfs_dmapiops(VHEAD(v), p))
-#define VFS_QUOTACTL(v, c,id,p, rv)	((rv) = vfs_quotactl(VHEAD(v), c,id,p))
-#define VFS_INIT_VNODE(v, vp,b,ul)	( vfs_init_vnode(VHEAD(v), vp,b,ul) )
-#define VFS_FORCE_SHUTDOWN(v, fl,f,l)	( vfs_force_shutdown(VHEAD(v), fl,f,l) )
-#define VFS_FREEZE(v)			( vfs_freeze(VHEAD(v)) )
+#define VFSHEAD(v)			((v)->vfs_bh.bh_first)
+#define bhv_vfs_mount(v, ma,cr)		vfs_mount(VFSHEAD(v), ma,cr)
+#define bhv_vfs_parseargs(v, o,ma,f)	vfs_parseargs(VFSHEAD(v), o,ma,f)
+#define bhv_vfs_showargs(v, m)		vfs_showargs(VFSHEAD(v), m)
+#define bhv_vfs_unmount(v, f,cr)	vfs_unmount(VFSHEAD(v), f,cr)
+#define bhv_vfs_mntupdate(v, fl,args)	vfs_mntupdate(VFSHEAD(v), fl,args)
+#define bhv_vfs_root(v, vpp)		vfs_root(VFSHEAD(v), vpp)
+#define bhv_vfs_statvfs(v, sp,vp)	vfs_statvfs(VFSHEAD(v), sp,vp)
+#define bhv_vfs_sync(v, flag,cr)	vfs_sync(VFSHEAD(v), flag,cr)
+#define bhv_vfs_vget(v, vpp,fidp)	vfs_vget(VFSHEAD(v), vpp,fidp)
+#define bhv_vfs_dmapiops(v, p)		vfs_dmapiops(VFSHEAD(v), p)
+#define bhv_vfs_quotactl(v, c,id,p)	vfs_quotactl(VFSHEAD(v), c,id,p)
+#define bhv_vfs_init_vnode(v, vp,b,ul)	vfs_init_vnode(VFSHEAD(v), vp,b,ul)
+#define bhv_vfs_force_shutdown(v,u,f,l)	vfs_force_shutdown(VFSHEAD(v), u,f,l)
+#define bhv_vfs_freeze(v)		vfs_freeze(VFSHEAD(v))
 
 /*
- * PVFS's.  Operates on behavior descriptor pointers.
+ * Virtual filesystem operations, operating from next bhv.
  */
-#define PVFS_MOUNT(b, ma,cr, rv)	((rv) = vfs_mount(b, ma,cr))
-#define PVFS_PARSEARGS(b, o,ma,f, rv)	((rv) = vfs_parseargs(b, o,ma,f))
-#define PVFS_SHOWARGS(b, m, rv)		((rv) = vfs_showargs(b, m))
-#define PVFS_UNMOUNT(b, f,cr, rv)	((rv) = vfs_unmount(b, f,cr))
-#define PVFS_MNTUPDATE(b, fl, args, rv)	((rv) = vfs_mntupdate(b, fl, args))
-#define PVFS_ROOT(b, vpp, rv)		((rv) = vfs_root(b, vpp))
-#define PVFS_STATVFS(b, sp,vp, rv)	((rv) = vfs_statvfs(b, sp,vp))
-#define PVFS_SYNC(b, flag,cr, rv)	((rv) = vfs_sync(b, flag,cr))
-#define PVFS_VGET(b, vpp,fidp, rv)	((rv) = vfs_vget(b, vpp,fidp))
-#define PVFS_DMAPIOPS(b, p, rv)		((rv) = vfs_dmapiops(b, p))
-#define PVFS_QUOTACTL(b, c,id,p, rv)	((rv) = vfs_quotactl(b, c,id,p))
-#define PVFS_INIT_VNODE(b, vp,b2,ul)	( vfs_init_vnode(b, vp,b2,ul) )
-#define PVFS_FORCE_SHUTDOWN(b, fl,f,l)	( vfs_force_shutdown(b, fl,f,l) )
-#define PVFS_FREEZE(b)			( vfs_freeze(b) )
+#define bhv_next_vfs_mount(b, ma,cr)		vfs_mount(b, ma,cr)
+#define bhv_next_vfs_parseargs(b, o,ma,f)	vfs_parseargs(b, o,ma,f)
+#define bhv_next_vfs_showargs(b, m)		vfs_showargs(b, m)
+#define bhv_next_vfs_unmount(b, f,cr)		vfs_unmount(b, f,cr)
+#define bhv_next_vfs_mntupdate(b, fl,args)	vfs_mntupdate(b, fl, args)
+#define bhv_next_vfs_root(b, vpp)		vfs_root(b, vpp)
+#define bhv_next_vfs_statvfs(b, sp,vp)		vfs_statvfs(b, sp,vp)
+#define bhv_next_vfs_sync(b, flag,cr)		vfs_sync(b, flag,cr)
+#define bhv_next_vfs_vget(b, vpp,fidp)		vfs_vget(b, vpp,fidp)
+#define bhv_next_vfs_dmapiops(b, p)		vfs_dmapiops(b, p)
+#define bhv_next_vfs_quotactl(b, c,id,p)	vfs_quotactl(b, c,id,p)
+#define bhv_next_vfs_init_vnode(b, vp,b2,ul)	vfs_init_vnode(b, vp,b2,ul)
+#define bhv_next_force_shutdown(b, fl,f,l)	vfs_force_shutdown(b, fl,f,l)
+#define bhv_next_vfs_freeze(b)			vfs_freeze(b)
 
 extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
 extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
 extern int vfs_showargs(bhv_desc_t *, struct seq_file *);
 extern int vfs_unmount(bhv_desc_t *, int, struct cred *);
 extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *);
-extern int vfs_root(bhv_desc_t *, struct vnode **);
-extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
+extern int vfs_root(bhv_desc_t *, struct bhv_vnode **);
+extern int vfs_statvfs(bhv_desc_t *, bhv_statvfs_t *, struct bhv_vnode *);
 extern int vfs_sync(bhv_desc_t *, int, struct cred *);
-extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *);
+extern int vfs_vget(bhv_desc_t *, struct bhv_vnode **, struct fid *);
 extern int vfs_dmapiops(bhv_desc_t *, caddr_t);
 extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
-extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
+extern void vfs_init_vnode(bhv_desc_t *, struct bhv_vnode *, bhv_desc_t *, int);
 extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
 extern void vfs_freeze(bhv_desc_t *);
 
-typedef struct bhv_vfsops {
-	struct vfsops		bhv_common;
+#define vfs_test_for_freeze(vfs)	((vfs)->vfs_super->s_frozen)
+#define vfs_wait_for_freeze(vfs,l)	vfs_check_frozen((vfs)->vfs_super, (l))
+ 
+typedef struct bhv_module_vfsops {
+	struct bhv_vfsops	bhv_common;
 	void *			bhv_custom;
-} bhv_vfsops_t;
+} bhv_module_vfsops_t;
 
-#define vfs_bhv_lookup(v, id)	( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) )
-#define vfs_bhv_custom(b)	( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom )
-#define vfs_bhv_set_custom(b,o)	( (b)->bhv_custom = (void *)(o))
-#define vfs_bhv_clr_custom(b)	( (b)->bhv_custom = NULL )
+#define vfs_bhv_lookup(v, id)	(bhv_lookup_range(&(v)->vfs_bh, (id), (id)))
+#define vfs_bhv_custom(b)	(((bhv_module_vfsops_t*)BHV_OPS(b))->bhv_custom)
+#define vfs_bhv_set_custom(b,o)	((b)->bhv_custom = (void *)(o))
+#define vfs_bhv_clr_custom(b)	((b)->bhv_custom = NULL)
 
-extern vfs_t *vfs_allocate(struct super_block *);
-extern vfs_t *vfs_from_sb(struct super_block *);
-extern void vfs_deallocate(vfs_t *);
-extern void vfs_insertops(vfs_t *, bhv_vfsops_t *);
-extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *);
+extern bhv_vfs_t *vfs_allocate(struct super_block *);
+extern bhv_vfs_t *vfs_from_sb(struct super_block *);
+extern void vfs_deallocate(bhv_vfs_t *);
+extern void vfs_insertbhv(bhv_vfs_t *, bhv_desc_t *, bhv_vfsops_t *, void *);
 
-extern void bhv_insert_all_vfsops(struct vfs *);
-extern void bhv_remove_all_vfsops(struct vfs *, int);
-extern void bhv_remove_vfsops(struct vfs *, int);
+extern void vfs_insertops(bhv_vfs_t *, bhv_module_vfsops_t *);
 
-#define fs_frozen(vfsp)		((vfsp)->vfs_super->s_frozen)
-#define fs_check_frozen(vfsp, level) \
-	vfs_check_frozen(vfsp->vfs_super, level);
+extern void bhv_insert_all_vfsops(struct bhv_vfs *);
+extern void bhv_remove_all_vfsops(struct bhv_vfs *, int);
+extern void bhv_remove_vfsops(struct bhv_vfs *, int);
 
 #endif	/* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index d27c25b27ccd..6628d96b6fd6 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -39,7 +39,7 @@ vn_init(void)
 
 void
 vn_iowait(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	wait_queue_head_t *wq = vptosync(vp);
 
@@ -48,17 +48,33 @@ vn_iowait(
 
 void
 vn_iowake(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	if (atomic_dec_and_test(&vp->v_iocount))
 		wake_up(vptosync(vp));
 }
 
-struct vnode *
+/*
+ * Volume managers supporting multiple paths can send back ENODEV when the
+ * final path disappears.  In this case continuing to fill the page cache
+ * with dirty data which cannot be written out is evil, so prevent that.
+ */
+void
+vn_ioerror(
+	bhv_vnode_t	*vp,
+	int		error,
+	char		*f,
+	int		l)
+{
+	if (unlikely(error == -ENODEV))
+		bhv_vfs_force_shutdown(vp->v_vfsp, SHUTDOWN_DEVICE_REQ, f, l);
+}
+
+bhv_vnode_t *
 vn_initialize(
 	struct inode	*inode)
 {
-	struct vnode	*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	XFS_STATS_INC(vn_active);
 	XFS_STATS_INC(vn_alloc);
@@ -94,8 +110,8 @@ vn_initialize(
  */
 void
 vn_revalidate_core(
-	struct vnode	*vp,
-	vattr_t		*vap)
+	bhv_vnode_t	*vp,
+	bhv_vattr_t	*vap)
 {
 	struct inode	*inode = vn_to_inode(vp);
 
@@ -130,14 +146,14 @@ vn_revalidate_core(
  */
 int
 __vn_revalidate(
-	struct vnode	*vp,
-	struct vattr	*vattr)
+	bhv_vnode_t	*vp,
+	bhv_vattr_t	*vattr)
 {
 	int		error;
 
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 	vattr->va_mask = XFS_AT_STAT | XFS_AT_XFLAGS;
-	VOP_GETATTR(vp, vattr, 0, NULL, error);
+	error = bhv_vop_getattr(vp, vattr, 0, NULL);
 	if (likely(!error)) {
 		vn_revalidate_core(vp, vattr);
 		VUNMODIFY(vp);
@@ -147,9 +163,9 @@ __vn_revalidate(
 
 int
 vn_revalidate(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
-	vattr_t		vattr;
+	bhv_vattr_t	vattr;
 
 	return __vn_revalidate(vp, &vattr);
 }
@@ -157,9 +173,9 @@ vn_revalidate(
 /*
  * Add a reference to a referenced vnode.
  */
-struct vnode *
+bhv_vnode_t *
 vn_hold(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	struct inode	*inode;
 
@@ -192,31 +208,31 @@ vn_hold(
  * Vnode tracing code.
  */
 void
-vn_trace_entry(vnode_t *vp, const char *func, inst_t *ra)
+vn_trace_entry(bhv_vnode_t *vp, const char *func, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
 }
 
 void
-vn_trace_exit(vnode_t *vp, const char *func, inst_t *ra)
+vn_trace_exit(bhv_vnode_t *vp, const char *func, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
 }
 
 void
-vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_hold(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
 }
 
 void
-vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_ref(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
 }
 
 void
-vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_rele(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
 }
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 2a8e16c22353..35c6a01963a7 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -14,57 +14,35 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Portions Copyright (c) 1989, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
  */
 #ifndef __XFS_VNODE_H__
 #define __XFS_VNODE_H__
 
 struct uio;
 struct file;
-struct vattr;
+struct bhv_vfs;
+struct bhv_vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
 
+typedef struct dentry	bhv_vname_t;
+typedef __u64		bhv_vnumber_t;
 
-typedef xfs_ino_t vnumber_t;
-typedef struct dentry vname_t;
-typedef bhv_head_t vn_bhv_head_t;
+typedef enum bhv_vflags {
+	VMODIFIED	= 0x08,	/* XFS inode state possibly differs */
+				/* to the Linux inode state. */
+	VTRUNCATED	= 0x40,	/* truncated down so flush-on-close */
+} bhv_vflags_t;
 
 /*
  * MP locking protocols:
  *	v_flag, v_vfsp				VN_LOCK/VN_UNLOCK
  */
-typedef struct vnode {
-	__u32		v_flag;			/* vnode flags (see below) */
-	struct vfs	*v_vfsp;		/* ptr to containing VFS */
-	vnumber_t	v_number;		/* in-core vnode number */
-	vn_bhv_head_t	v_bh;			/* behavior head */
+typedef struct bhv_vnode {
+	bhv_vflags_t	v_flag;			/* vnode flags (see above) */
+	bhv_vfs_t	*v_vfsp;		/* ptr to containing VFS */
+	bhv_vnumber_t	v_number;		/* in-core vnode number */
+	bhv_head_t	v_bh;			/* behavior head */
 	spinlock_t	v_lock;			/* VN_LOCK/VN_UNLOCK */
 	atomic_t	v_iocount;		/* outstanding I/O count */
 #ifdef XFS_VNODE_TRACE
@@ -72,7 +50,7 @@ typedef struct vnode {
 #endif
 	struct inode	v_inode;		/* Linux inode */
 	/* inode MUST be last */
-} vnode_t;
+} bhv_vnode_t;
 
 #define VN_ISLNK(vp)	S_ISLNK((vp)->v_inode.i_mode)
 #define VN_ISREG(vp)	S_ISREG((vp)->v_inode.i_mode)
@@ -80,9 +58,6 @@ typedef struct vnode {
 #define VN_ISCHR(vp)	S_ISCHR((vp)->v_inode.i_mode)
 #define VN_ISBLK(vp)	S_ISBLK((vp)->v_inode.i_mode)
 
-#define v_fbhv			v_bh.bh_first	       /* first behavior */
-#define v_fops			v_bh.bh_first->bd_ops  /* first behavior ops */
-
 #define VNODE_POSITION_BASE	BHV_POSITION_BASE	/* chain bottom */
 #define VNODE_POSITION_TOP	BHV_POSITION_TOP	/* chain top */
 #define VNODE_POSITION_INVALID	BHV_POSITION_INVALID	/* invalid pos. num */
@@ -104,8 +79,8 @@ typedef enum {
 /*
  * Macros for dealing with the behavior descriptor inside of the vnode.
  */
-#define BHV_TO_VNODE(bdp)	((vnode_t *)BHV_VOBJ(bdp))
-#define BHV_TO_VNODE_NULL(bdp)	((vnode_t *)BHV_VOBJNULL(bdp))
+#define BHV_TO_VNODE(bdp)	((bhv_vnode_t *)BHV_VOBJ(bdp))
+#define BHV_TO_VNODE_NULL(bdp)	((bhv_vnode_t *)BHV_VOBJNULL(bdp))
 
 #define VN_BHV_HEAD(vp)			((bhv_head_t *)(&((vp)->v_bh)))
 #define vn_bhv_head_init(bhp,name)	bhv_head_init(bhp,name)
@@ -116,35 +91,29 @@ typedef enum {
 /*
  * Vnode to Linux inode mapping.
  */
-static inline struct vnode *vn_from_inode(struct inode *inode)
+static inline struct bhv_vnode *vn_from_inode(struct inode *inode)
 {
-	return (vnode_t *)list_entry(inode, vnode_t, v_inode);
+	return (bhv_vnode_t *)list_entry(inode, bhv_vnode_t, v_inode);
 }
-static inline struct inode *vn_to_inode(struct vnode *vnode)
+static inline struct inode *vn_to_inode(struct bhv_vnode *vnode)
 {
 	return &vnode->v_inode;
 }
 
 /*
- * Vnode flags.
- */
-#define VMODIFIED	       0x8	/* XFS inode state possibly differs */
-					/* to the Linux inode state.	*/
-
-/*
- * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
+ * Values for the vop_rwlock/rwunlock flags parameter.
  */
-typedef enum vrwlock {
+typedef enum bhv_vrwlock {
 	VRWLOCK_NONE,
 	VRWLOCK_READ,
 	VRWLOCK_WRITE,
 	VRWLOCK_WRITE_DIRECT,
 	VRWLOCK_TRY_READ,
 	VRWLOCK_TRY_WRITE
-} vrwlock_t;
+} bhv_vrwlock_t;
 
 /*
- * Return values for VOP_INACTIVE.  A return value of
+ * Return values for bhv_vop_inactive.  A return value of
  * VN_INACTIVE_NOCACHE implies that the file system behavior
  * has disassociated its state and bhv_desc_t from the vnode.
  */
@@ -152,18 +121,20 @@ typedef enum vrwlock {
 #define	VN_INACTIVE_NOCACHE	1
 
 /*
- * Values for the cmd code given to VOP_VNODE_CHANGE.
+ * Values for the cmd code given to vop_vnode_change.
  */
-typedef enum vchange {
+typedef enum bhv_vchange {
 	VCHANGE_FLAGS_FRLOCKS		= 0,
 	VCHANGE_FLAGS_ENF_LOCKING	= 1,
 	VCHANGE_FLAGS_TRUNCATED		= 2,
 	VCHANGE_FLAGS_PAGE_DIRTY	= 3,
 	VCHANGE_FLAGS_IOEXCL_COUNT	= 4
-} vchange_t;
+} bhv_vchange_t;
 
+typedef enum { L_FALSE, L_TRUE } lastclose_t;
 
 typedef int	(*vop_open_t)(bhv_desc_t *, struct cred *);
+typedef int	(*vop_close_t)(bhv_desc_t *, int, lastclose_t, struct cred *);
 typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
 				const struct iovec *, unsigned int,
 				loff_t *, int, struct cred *);
@@ -181,27 +152,27 @@ typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *,
 				struct cred *);
 typedef int	(*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
 				int, unsigned int, void __user *);
-typedef int	(*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
+typedef int	(*vop_getattr_t)(bhv_desc_t *, struct bhv_vattr *, int,
 				struct cred *);
-typedef int	(*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
+typedef int	(*vop_setattr_t)(bhv_desc_t *, struct bhv_vattr *, int,
 				struct cred *);
 typedef int	(*vop_access_t)(bhv_desc_t *, int, struct cred *);
-typedef int	(*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **,
-				int, vnode_t *, struct cred *);
-typedef int	(*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				vnode_t **, struct cred *);
-typedef int	(*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *);
-typedef int	(*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *,
-				struct cred *);
-typedef int	(*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *,
+typedef int	(*vop_lookup_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t **,
+				int, bhv_vnode_t *, struct cred *);
+typedef int	(*vop_create_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *,
+				bhv_vnode_t **, struct cred *);
+typedef int	(*vop_remove_t)(bhv_desc_t *, bhv_vname_t *, struct cred *);
+typedef int	(*vop_link_t)(bhv_desc_t *, bhv_vnode_t *, bhv_vname_t *,
 				struct cred *);
-typedef int	(*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				vnode_t **, struct cred *);
-typedef int	(*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *);
+typedef int	(*vop_rename_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *,
+				bhv_vname_t *, struct cred *);
+typedef int	(*vop_mkdir_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *,
+				bhv_vnode_t **, struct cred *);
+typedef int	(*vop_rmdir_t)(bhv_desc_t *, bhv_vname_t *, struct cred *);
 typedef int	(*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
 				int *);
-typedef int	(*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				char *, vnode_t **, struct cred *);
+typedef int	(*vop_symlink_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr*,
+				char *, bhv_vnode_t **, struct cred *);
 typedef int	(*vop_readlink_t)(bhv_desc_t *, struct uio *, int,
 				struct cred *);
 typedef int	(*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
@@ -209,8 +180,8 @@ typedef int	(*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
 typedef int	(*vop_inactive_t)(bhv_desc_t *, struct cred *);
 typedef int	(*vop_fid2_t)(bhv_desc_t *, struct fid *);
 typedef int	(*vop_release_t)(bhv_desc_t *);
-typedef int	(*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
-typedef void	(*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
+typedef int	(*vop_rwlock_t)(bhv_desc_t *, bhv_vrwlock_t);
+typedef void	(*vop_rwunlock_t)(bhv_desc_t *, bhv_vrwlock_t);
 typedef int	(*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int,
 				struct xfs_iomap *, int *);
 typedef int	(*vop_reclaim_t)(bhv_desc_t *);
@@ -222,8 +193,8 @@ typedef	int	(*vop_attr_remove_t)(bhv_desc_t *, const char *,
 				int, struct cred *);
 typedef	int	(*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
 				struct attrlist_cursor_kern *, struct cred *);
-typedef void	(*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
-typedef void	(*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
+typedef void	(*vop_link_removed_t)(bhv_desc_t *, bhv_vnode_t *, int);
+typedef void	(*vop_vnode_change_t)(bhv_desc_t *, bhv_vchange_t, __psint_t);
 typedef void	(*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef void	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
@@ -231,9 +202,10 @@ typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
 typedef int	(*vop_iflush_t)(bhv_desc_t *, int);
 
 
-typedef struct vnodeops {
+typedef struct bhv_vnodeops {
 	bhv_position_t  vn_position;    /* position within behavior chain */
 	vop_open_t		vop_open;
+	vop_close_t		vop_close;
 	vop_read_t		vop_read;
 	vop_write_t		vop_write;
 	vop_sendfile_t		vop_sendfile;
@@ -271,103 +243,80 @@ typedef struct vnodeops {
 	vop_pflushvp_t		vop_flush_pages;
 	vop_release_t		vop_release;
 	vop_iflush_t		vop_iflush;
-} vnodeops_t;
+} bhv_vnodeops_t;
 
 /*
- * VOP's.
- */
-#define _VOP_(op, vp)	(*((vnodeops_t *)(vp)->v_fops)->op)
-
-#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv)			\
-	rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv)		\
-	rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv)		\
-	rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
-#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv)			\
-	rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
-#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv)			\
-	rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
-#define VOP_BMAP(vp,of,sz,rw,b,n,rv)					\
-	rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
-#define VOP_OPEN(vp, cr, rv)						\
-	rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
-#define VOP_GETATTR(vp, vap, f, cr, rv)					\
-	rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define	VOP_SETATTR(vp, vap, f, cr, rv)					\
-	rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define	VOP_ACCESS(vp, mode, cr, rv)					\
-	rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr)
-#define	VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv)				\
-	rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr)
-#define VOP_CREATE(dvp,d,vap,vpp,cr,rv)					\
-	rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr)
-#define VOP_REMOVE(dvp,d,cr,rv)						\
-	rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr)
-#define	VOP_LINK(tdvp,fvp,d,cr,rv)					\
-	rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr)
-#define	VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv)				\
-	rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr)
-#define	VOP_MKDIR(dp,d,vap,vpp,cr,rv)					\
-	rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr)
-#define	VOP_RMDIR(dp,d,cr,rv)	 					\
-	rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr)
-#define	VOP_READDIR(vp,uiop,cr,eofp,rv)					\
-	rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp)
-#define	VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv)				\
-	rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr)
-#define	VOP_READLINK(vp,uiop,fl,cr,rv)					\
-	rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr)
-#define	VOP_FSYNC(vp,f,cr,b,e,rv)					\
-	rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e)
-#define VOP_INACTIVE(vp, cr, rv)					\
-	rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr)
-#define VOP_RELEASE(vp, rv)						\
-	rv = _VOP_(vop_release, vp)((vp)->v_fbhv)
-#define VOP_FID2(vp, fidp, rv)						\
-	rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp)
-#define VOP_RWLOCK(vp,i)						\
-	(void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWLOCK_TRY(vp,i)						\
-	_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWUNLOCK(vp,i)						\
-	(void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i)
-#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv)				\
-	rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr)
-#define VOP_RECLAIM(vp, rv)						\
-	rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv)
-#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv)		\
-	rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred)
-#define	VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv)		\
-	rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred)
-#define	VOP_ATTR_REMOVE(vp, name, flags, cred, rv)			\
-	rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred)
-#define	VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv)		\
-	rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred)
-#define VOP_LINK_REMOVED(vp, dvp, linkzero)				\
-	(void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero)
-#define VOP_VNODE_CHANGE(vp, cmd, val)					\
-	(void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val)
-/*
- * These are page cache functions that now go thru VOPs.
- * 'last' parameter is unused and left in for IRIX compatibility
+ * Virtual node operations, operating from head bhv.
  */
-#define VOP_TOSS_PAGES(vp, first, last, fiopt)				\
-	_VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt)
-/*
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt)			\
-	_VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt)
-/*
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv)		\
-	rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt)
-#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv)			\
-	rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg)
-#define VOP_IFLUSH(vp, flags, rv)					\
-	rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags)
+#define VNHEAD(vp)	((vp)->v_bh.bh_first)
+#define VOP(op, vp)	(*((bhv_vnodeops_t *)VNHEAD(vp)->bd_ops)->op)
+#define bhv_vop_open(vp, cr)		VOP(vop_open, vp)(VNHEAD(vp),cr)
+#define bhv_vop_close(vp, f,last,cr)	VOP(vop_close, vp)(VNHEAD(vp),f,last,cr)
+#define bhv_vop_read(vp,file,iov,segs,offset,ioflags,cr)		\
+		VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
+#define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr)		\
+		VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
+#define bhv_vop_sendfile(vp,f,off,ioflags,cnt,act,targ,cr)		\
+		VOP(vop_sendfile, vp)(VNHEAD(vp),f,off,ioflags,cnt,act,targ,cr)
+#define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr)			\
+		VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr)
+#define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr)		\
+		VOP(vop_splice_write, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr)
+#define bhv_vop_bmap(vp,of,sz,rw,b,n)					\
+		VOP(vop_bmap, vp)(VNHEAD(vp),of,sz,rw,b,n)
+#define bhv_vop_getattr(vp, vap,f,cr)					\
+		VOP(vop_getattr, vp)(VNHEAD(vp), vap,f,cr)
+#define	bhv_vop_setattr(vp, vap,f,cr)					\
+		VOP(vop_setattr, vp)(VNHEAD(vp), vap,f,cr)
+#define	bhv_vop_access(vp, mode,cr)	VOP(vop_access, vp)(VNHEAD(vp), mode,cr)
+#define	bhv_vop_lookup(vp,d,vpp,f,rdir,cr)				\
+		VOP(vop_lookup, vp)(VNHEAD(vp),d,vpp,f,rdir,cr)
+#define bhv_vop_create(dvp,d,vap,vpp,cr)				\
+		VOP(vop_create, dvp)(VNHEAD(dvp),d,vap,vpp,cr)
+#define bhv_vop_remove(dvp,d,cr)	VOP(vop_remove, dvp)(VNHEAD(dvp),d,cr)
+#define	bhv_vop_link(dvp,fvp,d,cr)	VOP(vop_link, dvp)(VNHEAD(dvp),fvp,d,cr)
+#define	bhv_vop_rename(fvp,fnm,tdvp,tnm,cr)				\
+		VOP(vop_rename, fvp)(VNHEAD(fvp),fnm,tdvp,tnm,cr)
+#define	bhv_vop_mkdir(dp,d,vap,vpp,cr)					\
+		VOP(vop_mkdir, dp)(VNHEAD(dp),d,vap,vpp,cr)
+#define	bhv_vop_rmdir(dp,d,cr)	 	VOP(vop_rmdir, dp)(VNHEAD(dp),d,cr)
+#define	bhv_vop_readdir(vp,uiop,cr,eofp)				\
+		VOP(vop_readdir, vp)(VNHEAD(vp),uiop,cr,eofp)
+#define	bhv_vop_symlink(dvp,d,vap,tnm,vpp,cr)				\
+		VOP(vop_symlink, dvp)(VNHEAD(dvp),d,vap,tnm,vpp,cr)
+#define	bhv_vop_readlink(vp,uiop,fl,cr)					\
+		VOP(vop_readlink, vp)(VNHEAD(vp),uiop,fl,cr)
+#define	bhv_vop_fsync(vp,f,cr,b,e)	VOP(vop_fsync, vp)(VNHEAD(vp),f,cr,b,e)
+#define bhv_vop_inactive(vp,cr)		VOP(vop_inactive, vp)(VNHEAD(vp),cr)
+#define bhv_vop_release(vp)		VOP(vop_release, vp)(VNHEAD(vp))
+#define bhv_vop_fid2(vp,fidp)		VOP(vop_fid2, vp)(VNHEAD(vp),fidp)
+#define bhv_vop_rwlock(vp,i)		VOP(vop_rwlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_rwlock_try(vp,i)	VOP(vop_rwlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_rwunlock(vp,i)		VOP(vop_rwunlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_frlock(vp,c,fl,flags,offset,fr)				\
+		VOP(vop_frlock, vp)(VNHEAD(vp),c,fl,flags,offset,fr)
+#define bhv_vop_reclaim(vp)		VOP(vop_reclaim, vp)(VNHEAD(vp))
+#define bhv_vop_attr_get(vp, name, val, vallenp, fl, cred)		\
+		VOP(vop_attr_get, vp)(VNHEAD(vp),name,val,vallenp,fl,cred)
+#define	bhv_vop_attr_set(vp, name, val, vallen, fl, cred)		\
+		VOP(vop_attr_set, vp)(VNHEAD(vp),name,val,vallen,fl,cred)
+#define	bhv_vop_attr_remove(vp, name, flags, cred)			\
+		VOP(vop_attr_remove, vp)(VNHEAD(vp),name,flags,cred)
+#define	bhv_vop_attr_list(vp, buf, buflen, fl, cursor, cred)		\
+		VOP(vop_attr_list, vp)(VNHEAD(vp),buf,buflen,fl,cursor,cred)
+#define bhv_vop_link_removed(vp, dvp, linkzero)				\
+		VOP(vop_link_removed, vp)(VNHEAD(vp), dvp, linkzero)
+#define bhv_vop_vnode_change(vp, cmd, val)				\
+		VOP(vop_vnode_change, vp)(VNHEAD(vp), cmd, val)
+#define bhv_vop_toss_pages(vp, first, last, fiopt)			\
+		VOP(vop_tosspages, vp)(VNHEAD(vp), first, last, fiopt)
+#define bhv_vop_flushinval_pages(vp, first, last, fiopt)		\
+		VOP(vop_flushinval_pages, vp)(VNHEAD(vp),first,last,fiopt)
+#define bhv_vop_flush_pages(vp, first, last, flags, fiopt)		\
+		VOP(vop_flush_pages, vp)(VNHEAD(vp),first,last,flags,fiopt)
+#define bhv_vop_ioctl(vp, inode, filp, fl, cmd, arg)			\
+		VOP(vop_ioctl, vp)(VNHEAD(vp),inode,filp,fl,cmd,arg)
+#define bhv_vop_iflush(vp, flags)	VOP(vop_iflush, vp)(VNHEAD(vp), flags)
 
 /*
  * Flags for read/write calls - same values as IRIX
@@ -377,7 +326,7 @@ typedef struct vnodeops {
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
 /*
- * Flags for VOP_IFLUSH call
+ * Flags for vop_iflush call
  */
 #define FLUSH_SYNC		1	/* wait for flush to complete	*/
 #define FLUSH_INODE		2	/* flush the inode itself	*/
@@ -385,8 +334,7 @@ typedef struct vnodeops {
 					 * this inode out to disk	*/
 
 /*
- * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
- *	VOP_FLUSH_PAGES.
+ * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
 #define FI_NONE			0	/* none */
 #define FI_REMAPF		1	/* Do a remapf prior to the operation */
@@ -398,7 +346,7 @@ typedef struct vnodeops {
  * Vnode attributes.  va_mask indicates those attributes the caller
  * wants to set or extract.
  */
-typedef struct vattr {
+typedef struct bhv_vattr {
 	int		va_mask;	/* bit-mask of attributes present */
 	mode_t		va_mode;	/* file access mode and type */
 	xfs_nlink_t	va_nlink;	/* number of references to file */
@@ -418,7 +366,7 @@ typedef struct vattr {
 	u_long		va_nextents;	/* number of extents in file */
 	u_long		va_anextents;	/* number of attr extents in file */
 	prid_t		va_projid;	/* project id */
-} vattr_t;
+} bhv_vattr_t;
 
 /*
  * setattr or getattr attributes
@@ -492,29 +440,17 @@ typedef struct vattr {
 	(VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
 
 extern void	vn_init(void);
-extern vnode_t	*vn_initialize(struct inode *);
-
-/*
- * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
- */
-typedef struct vnode_map {
-	vfs_t		*v_vfsp;
-	vnumber_t	v_number;		/* in-core vnode number */
-	xfs_ino_t	v_ino;			/* inode #	*/
-} vmap_t;
-
-#define VMAP(vp, vmap)	{(vmap).v_vfsp	 = (vp)->v_vfsp,	\
-			 (vmap).v_number = (vp)->v_number,	\
-			 (vmap).v_ino	 = (vp)->v_inode.i_ino; }
+extern bhv_vnode_t	*vn_initialize(struct inode *);
+extern int	vn_revalidate(struct bhv_vnode *);
+extern int	__vn_revalidate(struct bhv_vnode *, bhv_vattr_t *);
+extern void	vn_revalidate_core(struct bhv_vnode *, bhv_vattr_t *);
 
-extern int	vn_revalidate(struct vnode *);
-extern int	__vn_revalidate(struct vnode *, vattr_t *);
-extern void	vn_revalidate_core(struct vnode *, vattr_t *);
+extern void	vn_iowait(struct bhv_vnode *vp);
+extern void	vn_iowake(struct bhv_vnode *vp);
 
-extern void	vn_iowait(struct vnode *vp);
-extern void	vn_iowake(struct vnode *vp);
+extern void	vn_ioerror(struct bhv_vnode *vp, int error, char *f, int l);
 
-static inline int vn_count(struct vnode *vp)
+static inline int vn_count(struct bhv_vnode *vp)
 {
 	return atomic_read(&vn_to_inode(vp)->i_count);
 }
@@ -522,7 +458,7 @@ static inline int vn_count(struct vnode *vp)
 /*
  * Vnode reference counting functions (and macros for compatibility).
  */
-extern vnode_t	*vn_hold(struct vnode *);
+extern bhv_vnode_t	*vn_hold(struct bhv_vnode *);
 
 #if defined(XFS_VNODE_TRACE)
 #define VN_HOLD(vp)		\
@@ -536,7 +472,7 @@ extern vnode_t	*vn_hold(struct vnode *);
 #define VN_RELE(vp)		(iput(vn_to_inode(vp)))
 #endif
 
-static inline struct vnode *vn_grab(struct vnode *vp)
+static inline struct bhv_vnode *vn_grab(struct bhv_vnode *vp)
 {
 	struct inode *inode = igrab(vn_to_inode(vp));
 	return inode ? vn_from_inode(inode) : NULL;
@@ -554,32 +490,39 @@ static inline struct vnode *vn_grab(struct vnode *vp)
  */
 #define VN_LOCK(vp)		mutex_spinlock(&(vp)->v_lock)
 #define VN_UNLOCK(vp, s)	mutex_spinunlock(&(vp)->v_lock, s)
-#define VN_FLAGSET(vp,b)	vn_flagset(vp,b)
-#define VN_FLAGCLR(vp,b)	vn_flagclr(vp,b)
 
-static __inline__ void vn_flagset(struct vnode *vp, uint flag)
+static __inline__ void vn_flagset(struct bhv_vnode *vp, uint flag)
 {
 	spin_lock(&vp->v_lock);
 	vp->v_flag |= flag;
 	spin_unlock(&vp->v_lock);
 }
 
-static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
+static __inline__ uint vn_flagclr(struct bhv_vnode *vp, uint flag)
 {
+	uint	cleared;
+
 	spin_lock(&vp->v_lock);
+	cleared = (vp->v_flag & flag);
 	vp->v_flag &= ~flag;
 	spin_unlock(&vp->v_lock);
+	return cleared;
 }
 
+#define VMODIFY(vp)	vn_flagset(vp, VMODIFIED)
+#define VUNMODIFY(vp)	vn_flagclr(vp, VMODIFIED)
+#define VTRUNCATE(vp)	vn_flagset(vp, VTRUNCATED)
+#define VUNTRUNCATE(vp)	vn_flagclr(vp, VTRUNCATED)
+
 /*
  * Dealing with bad inodes
  */
-static inline void vn_mark_bad(struct vnode *vp)
+static inline void vn_mark_bad(struct bhv_vnode *vp)
 {
 	make_bad_inode(vn_to_inode(vp));
 }
 
-static inline int VN_BAD(struct vnode *vp)
+static inline int VN_BAD(struct bhv_vnode *vp)
 {
 	return is_bad_inode(vn_to_inode(vp));
 }
@@ -587,18 +530,18 @@ static inline int VN_BAD(struct vnode *vp)
 /*
  * Extracting atime values in various formats
  */
-static inline void vn_atime_to_bstime(struct vnode *vp, xfs_bstime_t *bs_atime)
+static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime)
 {
 	bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec;
 	bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec;
 }
 
-static inline void vn_atime_to_timespec(struct vnode *vp, struct timespec *ts)
+static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts)
 {
 	*ts = vp->v_inode.i_atime;
 }
 
-static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
+static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 {
 	*tt = vp->v_inode.i_atime.tv_sec;
 }
@@ -610,11 +553,10 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define VN_CACHED(vp)	(vn_to_inode(vp)->i_mapping->nrpages)
 #define VN_DIRTY(vp)	mapping_tagged(vn_to_inode(vp)->i_mapping, \
 					PAGECACHE_TAG_DIRTY)
-#define VMODIFY(vp)	VN_FLAGSET(vp, VMODIFIED)
-#define VUNMODIFY(vp)	VN_FLAGCLR(vp, VMODIFIED)
+#define VN_TRUNC(vp)	((vp)->v_flag & VTRUNCATED)
 
 /*
- * Flags to VOP_SETATTR/VOP_GETATTR.
+ * Flags to vop_setattr/getattr.
  */
 #define	ATTR_UTIME	0x01	/* non-default utime(2) request */
 #define	ATTR_DMI	0x08	/* invocation from a DMI function */
@@ -624,7 +566,7 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define ATTR_NOSIZETOK	0x400	/* Don't get the SIZE token */
 
 /*
- * Flags to VOP_FSYNC and VOP_RECLAIM.
+ * Flags to vop_fsync/reclaim.
  */
 #define FSYNC_NOWAIT	0	/* asynchronous flush */
 #define FSYNC_WAIT	0x1	/* synchronous fsync or forced reclaim */
@@ -643,11 +585,11 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define	VNODE_KTRACE_REF	4
 #define	VNODE_KTRACE_RELE	5
 
-extern void vn_trace_entry(struct vnode *, const char *, inst_t *);
-extern void vn_trace_exit(struct vnode *, const char *, inst_t *);
-extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
-extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
-extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_entry(struct bhv_vnode *, const char *, inst_t *);
+extern void vn_trace_exit(struct bhv_vnode *, const char *, inst_t *);
+extern void vn_trace_hold(struct bhv_vnode *, char *, int, inst_t *);
+extern void vn_trace_ref(struct bhv_vnode *, char *, int, inst_t *);
+extern void vn_trace_rele(struct bhv_vnode *, char *, int, inst_t *);
 
 #define	VN_TRACE(vp)		\
 	vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 772ac48329ea..3aa771531856 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -444,7 +442,7 @@ xfs_qm_dqalloc(
 			      XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
 			      &firstblock,
 			      XFS_QM_DQALLOC_SPACE_RES(mp),
-			      &map, &nmaps, &flist))) {
+			      &map, &nmaps, &flist, NULL))) {
 		goto error0;
 	}
 	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -559,7 +557,7 @@ xfs_qm_dqtobp(
 		error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
 				  XFS_DQUOT_CLUSTER_SIZE_FSB,
 				  XFS_BMAPI_METADATA,
-				  NULL, 0, &map, &nmaps, NULL);
+				  NULL, 0, &map, &nmaps, NULL, NULL);
 
 		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
 		if (error)
@@ -1261,7 +1259,7 @@ xfs_qm_dqflush(
 
 	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
 			   0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
-		xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
+		xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE);
 		return XFS_ERROR(EIO);
 	}
 
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index c0c629663a5c..78d3ab95c5fd 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -119,7 +119,7 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
  */
 #define xfs_dqflock(dqp)	 { psema(&((dqp)->q_flock), PINOD | PRECALC);\
 				   (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
-#define xfs_dqfunlock(dqp)	 { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
+#define xfs_dqfunlock(dqp)	 { ASSERT(issemalocked(&((dqp)->q_flock))); \
 				   vsema(&((dqp)->q_flock)); \
 				   (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
 
@@ -128,7 +128,7 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
 #define XFS_DQ_PINUNLOCK(dqp, s)   mutex_spinunlock( \
 				     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
 
-#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
+#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 546f48af882a..5b2dcc58b244 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -248,7 +246,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	 * inode flush completed and the inode was taken off the AIL.
 	 * So, just get out.
 	 */
-	if ((valusema(&(dqp->q_flock)) > 0)  ||
+	if (!issemalocked(&(dqp->q_flock))  ||
 	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
 		qip->qli_pushbuf_flag = 0;
 		xfs_dqunlock(dqp);
@@ -261,7 +259,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
 			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-				  (valusema(&(dqp->q_flock)) <= 0));
+				  issemalocked(&(dqp->q_flock)));
 			qip->qli_pushbuf_flag = 0;
 			xfs_dqunlock(dqp);
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7fb5eca9bd50..e23e45535c48 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1603,7 +1601,7 @@ xfs_qm_dqiterate(
 				  maxlblkcnt - lblkno,
 				  XFS_BMAPI_METADATA,
 				  NULL,
-				  0, map, &nmaps, NULL);
+				  0, map, &nmaps, NULL, NULL);
 		xfs_iunlock(qip, XFS_ILOCK_SHARED);
 		if (error)
 			break;
@@ -1905,9 +1903,7 @@ xfs_qm_quotacheck(
 		 */
 		if ((error = xfs_bulkstat(mp, &lastino, &count,
 				     xfs_qm_dqusage_adjust, NULL,
-				     structsz, NULL,
-				     BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
-				     &done)))
+				     structsz, NULL, BULKSTAT_FG_IGET, &done)))
 			break;
 
 	} while (! done);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 6838b36d95a9..e95e99f7168f 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -129,7 +127,7 @@ xfs_qm_parseargs(
 		return XFS_ERROR(EINVAL);
 	}
 
-	PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error);
+	error = bhv_next_vfs_parseargs(BHV_NEXT(bhv), options, args, update);
 	if (!error && !referenced)
 		bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM);
 	return error;
@@ -140,9 +138,8 @@ xfs_qm_showargs(
 	struct bhv_desc		*bhv,
 	struct seq_file		*m)
 {
-	struct vfs		*vfsp = bhvtovfs(bhv);
+	struct bhv_vfs		*vfsp = bhvtovfs(bhv);
 	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
-	int			error;
 
 	if (mp->m_qflags & XFS_UQUOTA_ACCT) {
 		(mp->m_qflags & XFS_UQUOTA_ENFD) ?
@@ -165,8 +162,7 @@ xfs_qm_showargs(
 	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
 		seq_puts(m, "," MNTOPT_NOQUOTA);
 
-	PVFS_SHOWARGS(BHV_NEXT(bhv), m, error);
-	return error;
+	return bhv_next_vfs_showargs(BHV_NEXT(bhv), m);
 }
 
 STATIC int
@@ -175,14 +171,67 @@ xfs_qm_mount(
 	struct xfs_mount_args	*args,
 	struct cred		*cr)
 {
-	struct vfs		*vfsp = bhvtovfs(bhv);
+	struct bhv_vfs		*vfsp = bhvtovfs(bhv);
 	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
-	int			error;
 
 	if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_PQUOTA))
 		xfs_qm_mount_quotainit(mp, args->flags);
-	PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error);
-	return error;
+	return bhv_next_vfs_mount(BHV_NEXT(bhv), args, cr);
+}
+
+/*
+ * Directory tree accounting is implemented using project quotas, where
+ * the project identifier is inherited from parent directories.  A statvfs
+ * (df, etc.) of a directory using project quota should return a statvfs of
+ * the project, not the entire filesystem, so that such trees appear to be
+ * filesystems in themselves.  The next behavior's statvfs runs first; the
+ * block and inode totals are then clamped to the project's soft (else hard)
+ * limit, when one is set and smaller, and the free counts recomputed.
+ */
+STATIC int
+xfs_qm_statvfs(
+	struct bhv_desc		*bhv,
+	bhv_statvfs_t		*statp,
+	struct bhv_vnode	*vnode)
+{
+	xfs_mount_t		*mp;
+	xfs_inode_t		*ip;
+	xfs_dquot_t		*dqp;
+	xfs_disk_dquot_t	*dp;
+	__uint64_t		limit;
+	int			error;
+
+	error = bhv_next_vfs_statvfs(BHV_NEXT(bhv), statp, vnode);
+	if (error || !vnode)
+		return error;
+
+	mp = XFS_BHVTOM(bhv);
+	ip = xfs_vtoi(vnode);
+	if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
+		return 0;
+	if (!(mp->m_qflags & XFS_PQUOTA_ACCT))
+		return 0;
+	if (!(mp->m_qflags & XFS_OQUOTA_ENFD))
+		return 0;
+	if (xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp))
+		return 0;
+	dp = &dqp->q_core;
+
+	/* NOTE(review): q_core fields are used raw here - confirm byte order. */
+	limit = dp->d_blk_softlimit ? dp->d_blk_softlimit : dp->d_blk_hardlimit;
+	if (limit && statp->f_blocks > limit) {
+		statp->f_blocks = limit;
+		statp->f_bfree = (statp->f_blocks > dp->d_bcount) ?
+					(statp->f_blocks - dp->d_bcount) : 0;
+	}
+	limit = dp->d_ino_softlimit ? dp->d_ino_softlimit : dp->d_ino_hardlimit;
+	if (limit && statp->f_files > limit) {
+		statp->f_files = limit;
+		statp->f_ffree = (statp->f_files > dp->d_icount) ?
+					(statp->f_files - dp->d_icount) : 0;
+	}
+	xfs_qm_dqput(dqp);
+	return 0;
 }
 
 STATIC int
@@ -191,7 +240,7 @@ xfs_qm_syncall(
 	int			flags,
 	cred_t			*credp)
 {
-	struct vfs		*vfsp = bhvtovfs(bhv);
+	struct bhv_vfs		*vfsp = bhvtovfs(bhv);
 	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
 	int			error;
 
@@ -210,8 +259,7 @@ xfs_qm_syncall(
 			}
 		}
 	}
-	PVFS_SYNC(BHV_NEXT(bhv), flags, credp, error);
-	return error;
+	return bhv_next_vfs_sync(BHV_NEXT(bhv), flags, credp);
 }
 
 STATIC int
@@ -346,11 +394,12 @@ STATIC struct xfs_qmops xfs_qmcore_xfs = {
 	.xfs_dqtrxops		= &xfs_trans_dquot_ops,
 };
 
-struct bhv_vfsops xfs_qmops = { {
+struct bhv_module_vfsops xfs_qmops = { {
 	BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM),
 	.vfs_parseargs		= xfs_qm_parseargs,
 	.vfs_showargs		= xfs_qm_showargs,
 	.vfs_mount		= xfs_qm_mount,
+	.vfs_statvfs		= xfs_qm_statvfs,
 	.vfs_sync		= xfs_qm_syncall,
 	.vfs_quotactl		= xfs_qm_quotactl, },
 };
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 0570f7733550..6f858fb81a36 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index c55db463bbf2..ed620c4d1594 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,7 +26,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -35,7 +34,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -91,8 +89,8 @@ xfs_qm_quotactl(
 	xfs_caddr_t	addr)
 {
 	xfs_mount_t	*mp;
+	bhv_vfs_t	*vfsp;
 	int		error;
-	struct vfs	*vfsp;
 
 	vfsp = bhvtovfs(bdp);
 	mp = XFS_VFSTOM(vfsp);
@@ -1035,7 +1033,7 @@ xfs_qm_dqrele_all_inodes(
 {
 	xfs_inode_t	*ip, *topino;
 	uint		ireclaims;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	boolean_t	vnode_refd;
 
 	ASSERT(mp->m_quotainfo);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 9168918db252..0242e9666e8e 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index b08b3d9345b7..36fbeccdc722 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -47,7 +47,7 @@ cmn_err(register int level, char *fmt, ...)
 	va_start(ap, fmt);
 	if (*fmt == '!') fp++;
 	len = vsprintf(message, fp, ap);
-	if (message[len-1] != '\n')
+	if (level != CE_DEBUG && message[len-1] != '\n')
 		strcat(message, "\n");
 	printk("%s%s", err_level[level], message);
 	va_end(ap);
@@ -68,7 +68,7 @@ icmn_err(register int level, char *fmt, va_list ap)
 		level = XFS_MAX_ERR_LEVEL;
 	spin_lock_irqsave(&xfs_err_lock,flags);
 	len = vsprintf(message, fmt, ap);
-	if (message[len-1] != '\n')
+	if (level != CE_DEBUG && message[len-1] != '\n')
 		strcat(message, "\n");
 	spin_unlock_irqrestore(&xfs_err_lock,flags);
 	printk("%s%s", err_level[level], message);
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index e3bf58112e7e..4f54dca662a8 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -33,9 +33,6 @@ extern void cmn_err(int, char *, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern void assfail(char *expr, char *f, int l);
 
-#define prdev(fmt,targ,args...) \
-	printk("Device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
-
 #define ASSERT_ALWAYS(expr)	\
 	(unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 2539af34eb63..4b0cb474be4c 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -21,12 +21,10 @@
 #include "xfs_bit.h"
 #include "xfs_inum.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -39,15 +37,15 @@
 #include <linux/capability.h>
 #include <linux/posix_acl_xattr.h>
 
-STATIC int	xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *);
+STATIC int	xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *);
 STATIC void     xfs_acl_filter_mode(mode_t, xfs_acl_t *);
 STATIC void	xfs_acl_get_endian(xfs_acl_t *);
 STATIC int	xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
 STATIC int	xfs_acl_invalid(xfs_acl_t *);
 STATIC void	xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void	xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
-STATIC void	xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
-STATIC int	xfs_acl_allow_set(vnode_t *, int);
+STATIC void	xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *);
+STATIC void	xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *);
+STATIC int	xfs_acl_allow_set(bhv_vnode_t *, int);
 
 kmem_zone_t *xfs_acl_zone;
 
@@ -57,7 +55,7 @@ kmem_zone_t *xfs_acl_zone;
  */
 int
 xfs_acl_vhasacl_access(
-	vnode_t		*vp)
+	bhv_vnode_t	*vp)
 {
 	int		error;
 
@@ -70,7 +68,7 @@ xfs_acl_vhasacl_access(
  */
 int
 xfs_acl_vhasacl_default(
-	vnode_t		*vp)
+	bhv_vnode_t	*vp)
 {
 	int		error;
 
@@ -209,7 +207,7 @@ posix_acl_xfs_to_xattr(
 
 int
 xfs_acl_vget(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	void		*acl,
 	size_t		size,
 	int		kind)
@@ -241,10 +239,10 @@ xfs_acl_vget(
 			goto out;
 		}
 		if (kind == _ACL_TYPE_ACCESS) {
-			vattr_t	va;
+			bhv_vattr_t	va;
 
 			va.va_mask = XFS_AT_MODE;
-			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+			error = bhv_vop_getattr(vp, &va, 0, sys_cred);
 			if (error)
 				goto out;
 			xfs_acl_sync_mode(va.va_mode, xfs_acl);
@@ -260,7 +258,7 @@ out:
 
 int
 xfs_acl_vremove(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	int		kind)
 {
 	int		error;
@@ -268,9 +266,9 @@ xfs_acl_vremove(
 	VN_HOLD(vp);
 	error = xfs_acl_allow_set(vp, kind);
 	if (!error) {
-		VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
-				SGI_ACL_DEFAULT: SGI_ACL_FILE,
-				ATTR_ROOT, sys_cred, error);
+		error = bhv_vop_attr_remove(vp, kind == _ACL_TYPE_DEFAULT?
+						SGI_ACL_DEFAULT: SGI_ACL_FILE,
+						ATTR_ROOT, sys_cred);
 		if (error == ENOATTR)
 			error = 0;	/* 'scool */
 	}
@@ -280,7 +278,7 @@ xfs_acl_vremove(
 
 int
 xfs_acl_vset(
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	void			*acl,
 	size_t			size,
 	int			kind)
@@ -370,10 +368,10 @@ xfs_acl_iaccess(
 
 STATIC int
 xfs_acl_allow_set(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	int		kind)
 {
-	vattr_t		va;
+	bhv_vattr_t	va;
 	int		error;
 
 	if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
@@ -383,7 +381,7 @@ xfs_acl_allow_set(
 	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
 		return EROFS;
 	va.va_mask = XFS_AT_UID;
-	VOP_GETATTR(vp, &va, 0, NULL, error);
+	error = bhv_vop_getattr(vp, &va, 0, NULL);
 	if (error)
 		return error;
 	if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
@@ -606,7 +604,7 @@ xfs_acl_get_endian(
  */
 STATIC void
 xfs_acl_get_attr(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_acl_t	*aclp,
 	int		kind,
 	int		flags,
@@ -616,9 +614,9 @@ xfs_acl_get_attr(
 
 	ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
 	flags |= ATTR_ROOT;
-	VOP_ATTR_GET(vp,
-		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
-		(char *)aclp, &len, flags, sys_cred, *error);
+	*error = bhv_vop_attr_get(vp, kind == _ACL_TYPE_ACCESS ?
+					SGI_ACL_FILE : SGI_ACL_DEFAULT,
+					(char *)aclp, &len, flags, sys_cred);
 	if (*error || (flags & ATTR_KERNOVAL))
 		return;
 	xfs_acl_get_endian(aclp);
@@ -629,7 +627,7 @@ xfs_acl_get_attr(
  */
 STATIC void
 xfs_acl_set_attr(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_acl_t	*aclp,
 	int		kind,
 	int		*error)
@@ -654,19 +652,19 @@ xfs_acl_set_attr(
 		INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
 	}
 	INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-	VOP_ATTR_SET(vp,
-		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
-		(char *)newacl, len, ATTR_ROOT, sys_cred, *error);
+	*error = bhv_vop_attr_set(vp, kind == _ACL_TYPE_ACCESS ?
+				SGI_ACL_FILE: SGI_ACL_DEFAULT,
+				(char *)newacl, len, ATTR_ROOT, sys_cred);
 	_ACL_FREE(newacl);
 }
 
 int
 xfs_acl_vtoacl(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_acl_t	*access_acl,
 	xfs_acl_t	*default_acl)
 {
-	vattr_t		va;
+	bhv_vattr_t	va;
 	int		error = 0;
 
 	if (access_acl) {
@@ -678,7 +676,7 @@ xfs_acl_vtoacl(
 		if (!error) {
 			/* Got the ACL, need the mode... */
 			va.va_mask = XFS_AT_MODE;
-			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+			error = bhv_vop_getattr(vp, &va, 0, sys_cred);
 		}
 
 		if (error)
@@ -701,8 +699,8 @@ xfs_acl_vtoacl(
  */
 int
 xfs_acl_inherit(
-	vnode_t		*vp,
-	vattr_t		*vap,
+	bhv_vnode_t	*vp,
+	bhv_vattr_t	*vap,
 	xfs_acl_t	*pdaclp)
 {
 	xfs_acl_t	*cacl;
@@ -757,11 +755,11 @@ xfs_acl_inherit(
  */
 STATIC int
 xfs_acl_setmode(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_acl_t	*acl,
 	int		*basicperms)
 {
-	vattr_t		va;
+	bhv_vattr_t	va;
 	xfs_acl_entry_t	*ap;
 	xfs_acl_entry_t	*gap = NULL;
 	int		i, error, nomask = 1;
@@ -776,7 +774,7 @@ xfs_acl_setmode(
 	 * mode.  The m:: bits take precedence over the g:: bits.
 	 */
 	va.va_mask = XFS_AT_MODE;
-	VOP_GETATTR(vp, &va, 0, sys_cred, error);
+	error = bhv_vop_getattr(vp, &va, 0, sys_cred);
 	if (error)
 		return error;
 
@@ -810,8 +808,7 @@ xfs_acl_setmode(
 	if (gap && nomask)
 		va.va_mode |= gap->ae_perm << 3;
 
-	VOP_SETATTR(vp, &va, 0, sys_cred, error);
-	return error;
+	return bhv_vop_setattr(vp, &va, 0, sys_cred);
 }
 
 /*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 538d0d65b04c..f853cf1a6270 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -50,7 +50,7 @@ typedef struct xfs_acl {
 #ifdef CONFIG_XFS_POSIX_ACL
 
 struct vattr;
-struct vnode;
+struct bhv_vnode;
 struct xfs_inode;
 
 extern struct kmem_zone *xfs_acl_zone;
@@ -58,14 +58,14 @@ extern struct kmem_zone *xfs_acl_zone;
 		(zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
 #define xfs_acl_zone_destroy(zone)	kmem_zone_destroy(zone)
 
-extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
+extern int xfs_acl_inherit(struct bhv_vnode *, struct bhv_vattr *, xfs_acl_t *);
 extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct vnode *);
-extern int xfs_acl_vhasacl_default(struct vnode *);
-extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
-extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct vnode *vp, int);
+extern int xfs_acl_vtoacl(struct bhv_vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vhasacl_access(struct bhv_vnode *);
+extern int xfs_acl_vhasacl_default(struct bhv_vnode *);
+extern int xfs_acl_vset(struct bhv_vnode *, void *, size_t, int);
+extern int xfs_acl_vget(struct bhv_vnode *, void *, size_t, int);
+extern int xfs_acl_vremove(struct bhv_vnode *, int);
 
 #define _ACL_TYPE_ACCESS	1
 #define _ACL_TYPE_DEFAULT	2
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 8558226281c4..eef6763f3a67 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1862,7 +1860,7 @@ xfs_alloc_fix_freelist(
 		(pag->pagf_longest - delta) :
 		(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
 	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
-	    (args->minleft &&
+	    (!(flags & XFS_ALLOC_FLAG_FREEING) &&
 	     (int)(pag->pagf_freeblks + pag->pagf_flcount -
 		   need - args->total) <
 	     (int)args->minleft)) {
@@ -1898,7 +1896,7 @@ xfs_alloc_fix_freelist(
 	longest = (longest > delta) ? (longest - delta) :
 		(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
 	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
-	     (args->minleft &&
+	     (!(flags & XFS_ALLOC_FLAG_FREEING) &&
 		(int)(be32_to_cpu(agf->agf_freeblks) +
 		   be32_to_cpu(agf->agf_flcount) - need - args->total) <
 	     (int)args->minleft)) {
@@ -1951,8 +1949,14 @@ xfs_alloc_fix_freelist(
 		 * the restrictions correctly.  Can happen for free calls
 		 * on a completely full ag.
 		 */
-		if (targs.agbno == NULLAGBLOCK)
+		if (targs.agbno == NULLAGBLOCK) {
+			if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+				xfs_trans_brelse(tp, agflbp);
+				args->agbp = NULL;
+				return 0;
+			}
 			break;
+		}
 		/*
 		 * Put each allocated block on the list.
 		 */
@@ -2360,8 +2364,19 @@ xfs_alloc_vextent(
 			if (args->agno == sagno &&
 			    type == XFS_ALLOCTYPE_START_BNO)
 				args->type = XFS_ALLOCTYPE_THIS_AG;
-			if (++(args->agno) == mp->m_sb.sb_agcount)
-				args->agno = 0;
+			/*
+			* For the first allocation, we can try any AG to get
+			* space.  However, if we already have allocated a
+			* block, we don't want to try AGs whose number is below
+			* sagno. Otherwise, we may end up with out-of-order
+			* locking of AGF, which might cause deadlock.
+			*/
+			if (++(args->agno) == mp->m_sb.sb_agcount) {
+				if (args->firstblock != NULLFSBLOCK)
+					args->agno = sagno;
+				else
+					args->agno = 0;
+			}
 			/*
 			 * Reached the starting a.g., must either be done
 			 * or switch to non-trylock mode.
@@ -2443,7 +2458,7 @@ xfs_free_extent(
 	args.minlen = args.minleft = args.minalignslop = 0;
 	down_read(&args.mp->m_peraglock);
 	args.pag = &args.mp->m_perag[args.agno];
-	if ((error = xfs_alloc_fix_freelist(&args, 0)))
+	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
 		goto error0;
 #ifdef DEBUG
 	ASSERT(args.agbp != NULL);
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 2d1f8928b267..650591f999ae 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -41,6 +41,7 @@ typedef enum xfs_alloctype
  * Flags for xfs_alloc_fix_freelist.
  */
 #define	XFS_ALLOC_FLAG_TRYLOCK	0x00000001  /* use trylock for buffer locking */
+#define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
 
 /*
  * Argument structure for xfs_alloc routines.
@@ -70,6 +71,7 @@ typedef struct xfs_alloc_arg {
 	char		wasfromfl;	/* set if allocation is from freelist */
 	char		isfl;		/* set if is freelist blocks - !acctg */
 	char		userdata;	/* set if this is user data */
+	xfs_fsblock_t	firstblock;	/* io first block allocated */
 } xfs_alloc_arg_t;
 
 /*
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index a1d92da86ccd..7446556e8021 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b6e1e02bbb28..1a2101043275 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -27,7 +27,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -35,7 +34,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1910,7 +1908,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  NULL, 0, map, &nmap, NULL);
+				  NULL, 0, map, &nmap, NULL, NULL);
 		if (error)
 			return(error);
 		ASSERT(nmap >= 1);
@@ -1988,7 +1986,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
 							XFS_BMAPI_WRITE,
 				  args->firstblock, args->total, &map, &nmap,
-				  args->flist);
+				  args->flist, NULL);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						*args->firstblock, &committed);
@@ -2039,7 +2037,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  args->firstblock, 0, &map, &nmap, NULL);
+				  args->firstblock, 0, &map, &nmap,
+				  NULL, NULL);
 		if (error) {
 			return(error);
 		}
@@ -2104,7 +2103,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 					args->rmtblkcnt,
 					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 					args->firstblock, 0, &map, &nmap,
-					args->flist);
+					args->flist, NULL);
 		if (error) {
 			return(error);
 		}
@@ -2142,7 +2141,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		XFS_BMAP_INIT(args->flist, args->firstblock);
 		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
 				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				    1, args->firstblock, args->flist, &done);
+				    1, args->firstblock, args->flist,
+				    NULL, &done);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						*args->firstblock, &committed);
@@ -2322,56 +2322,56 @@ xfs_attr_trace_enter(int type, char *where,
 
 STATIC int
 posix_acl_access_set(
-	vnode_t	*vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
 }
 
 STATIC int
 posix_acl_access_remove(
-	struct vnode *vp, char *name, int xflags)
+	bhv_vnode_t *vp, char *name, int xflags)
 {
 	return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
 }
 
 STATIC int
 posix_acl_access_get(
-	vnode_t *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
 }
 
 STATIC int
 posix_acl_access_exists(
-	vnode_t *vp)
+	bhv_vnode_t *vp)
 {
 	return xfs_acl_vhasacl_access(vp);
 }
 
 STATIC int
 posix_acl_default_set(
-	vnode_t	*vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
 }
 
 STATIC int
 posix_acl_default_get(
-	vnode_t *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
 }
 
 STATIC int
 posix_acl_default_remove(
-	struct vnode *vp, char *name, int xflags)
+	bhv_vnode_t *vp, char *name, int xflags)
 {
 	return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
 }
 
 STATIC int
 posix_acl_default_exists(
-	vnode_t *vp)
+	bhv_vnode_t *vp)
 {
 	return xfs_acl_vhasacl_default(vp);
 }
@@ -2404,21 +2404,18 @@ STATIC struct attrnames *attr_system_names[] =
 
 STATIC int
 attr_generic_set(
-	struct vnode *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
-	int 	error;
-
-	VOP_ATTR_SET(vp, name, data, size, xflags, NULL, error);
-	return -error;
+	return -bhv_vop_attr_set(vp, name, data, size, xflags, NULL);
 }
 
 STATIC int
 attr_generic_get(
-	struct vnode *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	int	error, asize = size;
 
-	VOP_ATTR_GET(vp, name, data, &asize, xflags, NULL, error);
+	error = bhv_vop_attr_get(vp, name, data, &asize, xflags, NULL);
 	if (!error)
 		return asize;
 	return -error;
@@ -2426,12 +2423,9 @@ attr_generic_get(
 
 STATIC int
 attr_generic_remove(
-	struct vnode *vp, char *name, int xflags)
+	bhv_vnode_t *vp, char *name, int xflags)
 {
-	int	error;
-
-	VOP_ATTR_REMOVE(vp, name, xflags, NULL, error);
-	return -error;
+	return -bhv_vop_attr_remove(vp, name, xflags, NULL);
 }
 
 STATIC int
@@ -2459,7 +2453,7 @@ attr_generic_listadd(
 
 STATIC int
 attr_system_list(
-	struct vnode		*vp,
+	bhv_vnode_t		*vp,
 	void			*data,
 	size_t			size,
 	ssize_t			*result)
@@ -2481,12 +2475,12 @@ attr_system_list(
 
 int
 attr_generic_list(
-	struct vnode *vp, void *data, size_t size, int xflags, ssize_t *result)
+	bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result)
 {
 	attrlist_cursor_kern_t	cursor = { 0 };
 	int			error;
 
-	VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error);
+	error = bhv_vop_attr_list(vp, data, size, xflags, &cursor, NULL);
 	if (error > 0)
 		return -error;
 	*result = -error;
@@ -2514,7 +2508,7 @@ attr_lookup_namespace(
  */
 STATIC int
 attr_user_capable(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	cred_t		*cred)
 {
 	struct inode	*inode = vn_to_inode(vp);
@@ -2532,7 +2526,7 @@ attr_user_capable(
 
 STATIC int
 attr_trusted_capable(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	cred_t		*cred)
 {
 	struct inode	*inode = vn_to_inode(vp);
@@ -2546,7 +2540,7 @@ attr_trusted_capable(
 
 STATIC int
 attr_secure_capable(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	cred_t		*cred)
 {
 	return -ENOSECURITY;
@@ -2554,7 +2548,7 @@ attr_secure_capable(
 
 STATIC int
 attr_system_set(
-	struct vnode *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	attrnames_t	*namesp;
 	int		error;
@@ -2573,7 +2567,7 @@ attr_system_set(
 
 STATIC int
 attr_system_get(
-	struct vnode *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	attrnames_t	*namesp;
 
@@ -2585,7 +2579,7 @@ attr_system_get(
 
 STATIC int
 attr_system_remove(
-	struct vnode *vp, char *name, int xflags)
+	bhv_vnode_t *vp, char *name, int xflags)
 {
 	attrnames_t	*namesp;
 
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index b2c7b9fcded3..981633f6c077 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -36,13 +36,13 @@
  *========================================================================*/
 
 struct cred;
-struct vnode;
+struct bhv_vnode;
 
-typedef int (*attrset_t)(struct vnode *, char *, void *, size_t, int);
-typedef int (*attrget_t)(struct vnode *, char *, void *, size_t, int);
-typedef int (*attrremove_t)(struct vnode *, char *, int);
-typedef int (*attrexists_t)(struct vnode *);
-typedef int (*attrcapable_t)(struct vnode *, struct cred *);
+typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int);
+typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int);
+typedef int (*attrremove_t)(struct bhv_vnode *, char *, int);
+typedef int (*attrexists_t)(struct bhv_vnode *);
+typedef int (*attrcapable_t)(struct bhv_vnode *, struct cred *);
 
 typedef struct attrnames {
 	char *		attr_name;
@@ -63,7 +63,7 @@ extern struct attrnames attr_trusted;
 extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT];
 
 extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int);
-extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *);
+extern int attr_generic_list(struct bhv_vnode *, void *, size_t, int, ssize_t *);
 
 #define ATTR_DONTFOLLOW	0x0001	/* -- unused, from IRIX -- */
 #define ATTR_ROOT	0x0002	/* use attrs in root (trusted) namespace */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 9462be86aa14..9455051f0120 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -34,7 +33,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -2990,7 +2988,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 		nmap = 1;
 		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
 					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-					NULL, 0, &map, &nmap, NULL);
+					NULL, 0, &map, &nmap, NULL, NULL);
 		if (error) {
 			return(error);
 		}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 26939d364bc4..3a6137539064 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -24,13 +24,11 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -40,13 +38,15 @@
 #include "xfs_mount.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
 #include "xfs_inode_item.h"
 #include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_rw.h"
 #include "xfs_quota.h"
@@ -101,6 +101,7 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -118,6 +119,7 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
 /*
@@ -131,6 +133,7 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
 /*
@@ -144,6 +147,7 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork); /* data or attr fork */
 
 /*
@@ -156,7 +160,8 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp); /* inode logging flags */
+	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta); /* Change made to incore extents */
 
 /*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
@@ -203,6 +208,7 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	 /* OK to allocate reserved blocks */
 
@@ -510,7 +516,7 @@ xfs_bmap_add_attrfork_local(
 		dargs.total = mp->m_dirblkfsbs;
 		dargs.whichfork = XFS_DATA_FORK;
 		dargs.trans = tp;
-		error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs);
+		error = xfs_dir2_sf_to_block(&dargs);
 	} else
 		error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
 			XFS_DATA_FORK);
@@ -530,6 +536,7 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to use reserved data blocks */
 {
@@ -567,6 +574,15 @@ xfs_bmap_add_extent(
 			logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
 		} else
 			logflags = 0;
+		/* DELTA: single new extent */
+		if (delta) {
+			if (delta->xed_startoff > new->br_startoff)
+				delta->xed_startoff = new->br_startoff;
+			if (delta->xed_blockcount <
+					new->br_startoff + new->br_blockcount)
+				delta->xed_blockcount = new->br_startoff +
+						new->br_blockcount;
+		}
 	}
 	/*
 	 * Any kind of new delayed allocation goes here.
@@ -576,7 +592,7 @@ xfs_bmap_add_extent(
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
-				&logflags, rsvd)))
+				&logflags, delta, rsvd)))
 			goto done;
 	}
 	/*
@@ -587,7 +603,7 @@ xfs_bmap_add_extent(
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-				&logflags, whichfork)))
+				&logflags, delta, whichfork)))
 			goto done;
 	} else {
 		xfs_bmbt_irec_t	prev;	/* old extent at offset idx */
@@ -612,17 +628,17 @@ xfs_bmap_add_extent(
 						XFS_BTCUR_BPRV_WASDEL);
 				if ((error = xfs_bmap_add_extent_delay_real(ip,
 					idx, &cur, new, &da_new, first, flist,
-					&logflags, rsvd)))
+					&logflags, delta, rsvd)))
 					goto done;
 			} else if (new->br_state == XFS_EXT_NORM) {
 				ASSERT(new->br_state == XFS_EXT_NORM);
 				if ((error = xfs_bmap_add_extent_unwritten_real(
-					ip, idx, &cur, new, &logflags)))
+					ip, idx, &cur, new, &logflags, delta)))
 					goto done;
 			} else {
 				ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
 				if ((error = xfs_bmap_add_extent_unwritten_real(
-					ip, idx, &cur, new, &logflags)))
+					ip, idx, &cur, new, &logflags, delta)))
 					goto done;
 			}
 			ASSERT(*curp == cur || *curp == NULL);
@@ -635,7 +651,7 @@ xfs_bmap_add_extent(
 				ASSERT((cur->bc_private.b.flags &
 					XFS_BTCUR_BPRV_WASDEL) == 0);
 			if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-					new, &logflags, whichfork)))
+					new, &logflags, delta, whichfork)))
 				goto done;
 		}
 	}
@@ -700,6 +716,7 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)	/* OK to use reserved data block allocation */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
@@ -716,8 +733,8 @@ xfs_bmap_add_extent_delay_real(
 					/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
-	xfs_filblks_t		temp;	/* value for dnew calculations */
-	xfs_filblks_t		temp2;	/* value for dnew calculations */
+	xfs_filblks_t		temp=0;	/* value for dnew calculations */
+	xfs_filblks_t		temp2=0;/* value for dnew calculations */
 	int			tmp_rval;	/* partial logging flags */
 	enum {				/* bit number definitions for state */
 		LEFT_CONTIG,	RIGHT_CONTIG,
@@ -839,6 +856,11 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
+		/* DELTA: Three in-core extents are replaced by one. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
@@ -872,6 +894,10 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
+		/* DELTA: Two in-core extents are replaced by one. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
@@ -906,6 +932,10 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
+		/* DELTA: Two in-core extents are replaced by one. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK2(LEFT_FILLING, RIGHT_FILLING):
@@ -936,6 +966,9 @@ xfs_bmap_add_extent_delay_real(
 			ASSERT(i == 1);
 		}
 		*dnew = 0;
+		/* DELTA: The in-core extent described by new changed type. */
+		temp = new->br_startoff;
+		temp2 = new->br_blockcount;
 		break;
 
 	case MASK2(LEFT_FILLING, LEFT_CONTIG):
@@ -978,6 +1011,10 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
 			XFS_DATA_FORK);
 		*dnew = temp;
+		/* DELTA: The boundary between two in-core extents moved. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount;
 		break;
 
 	case MASK(LEFT_FILLING):
@@ -1025,6 +1062,9 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1,
 			XFS_DATA_FORK);
 		*dnew = temp;
+		/* DELTA: One in-core extent is split in two. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
@@ -1067,6 +1107,10 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
 			XFS_DATA_FORK);
 		*dnew = temp;
+		/* DELTA: The boundary between two in-core extents moved. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK(RIGHT_FILLING):
@@ -1112,6 +1156,9 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
 		xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
 		*dnew = temp;
+		/* DELTA: One in-core extent is split in two. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1194,6 +1241,9 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmap_trace_post_update(fname, "0", ip, idx + 2,
 			XFS_DATA_FORK);
 		*dnew = temp + temp2;
+		/* DELTA: One in-core extent is split in three. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
@@ -1209,6 +1259,13 @@ xfs_bmap_add_extent_delay_real(
 		ASSERT(0);
 	}
 	*curp = cur;
+	if (delta) {
+		temp2 += temp;
+		if (delta->xed_startoff > temp)
+			delta->xed_startoff = temp;
+		if (delta->xed_blockcount < temp2)
+			delta->xed_blockcount = temp2;
+	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1235,7 +1292,8 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp) /* inode logging flags */
+	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta) /* Change made to incore extents */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
 	xfs_bmbt_rec_t		*ep;	/* extent entry for idx */
@@ -1252,6 +1310,8 @@ xfs_bmap_add_extent_unwritten_real(
 					/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
+	xfs_filblks_t		temp=0;
+	xfs_filblks_t		temp2=0;
 	enum {				/* bit number definitions for state */
 		LEFT_CONTIG,	RIGHT_CONTIG,
 		LEFT_FILLING,	RIGHT_FILLING,
@@ -1380,6 +1440,11 @@ xfs_bmap_add_extent_unwritten_real(
 				RIGHT.br_blockcount, LEFT.br_state)))
 				goto done;
 		}
+		/* DELTA: Three in-core extents are replaced by one. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
@@ -1419,6 +1484,10 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state)))
 				goto done;
 		}
+		/* DELTA: Two in-core extents are replaced by one. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
@@ -1459,6 +1528,10 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
+		/* DELTA: Two in-core extents are replaced by one. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK2(LEFT_FILLING, RIGHT_FILLING):
@@ -1487,6 +1560,9 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
+		/* DELTA: The in-core extent described by new changed type. */
+		temp = new->br_startoff;
+		temp2 = new->br_blockcount;
 		break;
 
 	case MASK2(LEFT_FILLING, LEFT_CONTIG):
@@ -1534,6 +1610,10 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state))
 				goto done;
 		}
+		/* DELTA: The boundary between two in-core extents moved. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount;
 		break;
 
 	case MASK(LEFT_FILLING):
@@ -1574,6 +1654,9 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			ASSERT(i == 1);
 		}
+		/* DELTA: One in-core extent is split in two. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
@@ -1617,6 +1700,10 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
+		/* DELTA: The boundary between two in-core extents moved. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK(RIGHT_FILLING):
@@ -1657,6 +1744,9 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			ASSERT(i == 1);
 		}
+		/* DELTA: One in-core extent is split in two. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1710,6 +1800,9 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			ASSERT(i == 1);
 		}
+		/* DELTA: One in-core extent is split in three. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
@@ -1725,6 +1818,13 @@ xfs_bmap_add_extent_unwritten_real(
 		ASSERT(0);
 	}
 	*curp = cur;
+	if (delta) {
+		temp2 += temp;
+		if (delta->xed_startoff > temp)
+			delta->xed_startoff = temp;
+		if (delta->xed_blockcount < temp2)
+			delta->xed_blockcount = temp2;
+	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1753,6 +1853,7 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)		/* OK to allocate reserved blocks */
 {
 	xfs_bmbt_rec_t		*ep;	/* extent record for idx */
@@ -1765,7 +1866,8 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_filblks_t		oldlen=0;	/* old indirect size */
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			state;  /* state bits, accessed thru macros */
-	xfs_filblks_t		temp;	/* temp for indirect calculations */
+	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
+	xfs_filblks_t		temp2=0;
 	enum {				/* bit number definitions for state */
 		LEFT_CONTIG,	RIGHT_CONTIG,
 		LEFT_DELAY,	RIGHT_DELAY,
@@ -1844,6 +1946,9 @@ xfs_bmap_add_extent_hole_delay(
 			XFS_DATA_FORK);
 		xfs_iext_remove(ifp, idx, 1);
 		ip->i_df.if_lastex = idx - 1;
+		/* DELTA: Two in-core extents were replaced by one. */
+		temp2 = temp;
+		temp = left.br_startoff;
 		break;
 
 	case MASK(LEFT_CONTIG):
@@ -1864,6 +1969,9 @@ xfs_bmap_add_extent_hole_delay(
 		xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1,
 			XFS_DATA_FORK);
 		ip->i_df.if_lastex = idx - 1;
+		/* DELTA: One in-core extent grew into a hole. */
+		temp2 = temp;
+		temp = left.br_startoff;
 		break;
 
 	case MASK(RIGHT_CONTIG):
@@ -1881,6 +1989,9 @@ xfs_bmap_add_extent_hole_delay(
 			NULLSTARTBLOCK((int)newlen), temp, right.br_state);
 		xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK);
 		ip->i_df.if_lastex = idx;
+		/* DELTA: One in-core extent grew into a hole. */
+		temp2 = temp;
+		temp = new->br_startoff;
 		break;
 
 	case 0:
@@ -1894,6 +2005,9 @@ xfs_bmap_add_extent_hole_delay(
 			XFS_DATA_FORK);
 		xfs_iext_insert(ifp, idx, 1, new);
 		ip->i_df.if_lastex = idx;
+		/* DELTA: A new in-core extent was added in a hole. */
+		temp2 = new->br_blockcount;
+		temp = new->br_startoff;
 		break;
 	}
 	if (oldlen != newlen) {
@@ -1904,6 +2018,13 @@ xfs_bmap_add_extent_hole_delay(
 		 * Nothing to do for disk quota accounting here.
 		 */
 	}
+	if (delta) {
+		temp2 += temp;
+		if (delta->xed_startoff > temp)
+			delta->xed_startoff = temp;
+		if (delta->xed_blockcount < temp2)
+			delta->xed_blockcount = temp2;
+	}
 	*logflagsp = 0;
 	return 0;
 #undef	MASK
@@ -1925,6 +2046,7 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork) /* data or attr fork */
 {
 	xfs_bmbt_rec_t		*ep;	/* pointer to extent entry ins. point */
@@ -1936,7 +2058,10 @@ xfs_bmap_add_extent_hole_real(
 	xfs_ifork_t		*ifp;	/* inode fork pointer */
 	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			rval=0;	/* return value (logging flags) */
 	int			state;	/* state bits, accessed thru macros */
+	xfs_filblks_t		temp=0;
+	xfs_filblks_t		temp2=0;
 	enum {				/* bit number definitions for state */
 		LEFT_CONTIG,	RIGHT_CONTIG,
 		LEFT_DELAY,	RIGHT_DELAY,
@@ -1993,6 +2118,7 @@ xfs_bmap_add_extent_hole_real(
 		 left.br_blockcount + new->br_blockcount +
 		     right.br_blockcount <= MAXEXTLEN));
 
+	error = 0;
 	/*
 	 * Select which case we're in here, and implement it.
 	 */
@@ -2018,25 +2144,35 @@ xfs_bmap_add_extent_hole_real(
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL) {
-			*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
-			return 0;
+			rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+		} else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					right.br_startoff,
+					right.br_startblock,
+					right.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+						new->br_blockcount +
+						right.br_blockcount,
+					left.br_state)))
+				goto done;
 		}
-		*logflagsp = XFS_ILOG_CORE;
-		if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
-				right.br_startblock, right.br_blockcount, &i)))
-			return error;
-		ASSERT(i == 1);
-		if ((error = xfs_bmbt_delete(cur, &i)))
-			return error;
-		ASSERT(i == 1);
-		if ((error = xfs_bmbt_decrement(cur, 0, &i)))
-			return error;
-		ASSERT(i == 1);
-		error = xfs_bmbt_update(cur, left.br_startoff,
-				left.br_startblock,
-				left.br_blockcount + new->br_blockcount +
-				right.br_blockcount, left.br_state);
-		return error;
+		/* DELTA: Two in-core extents were replaced by one. */
+		temp = left.br_startoff;
+		temp2 = left.br_blockcount +
+			new->br_blockcount +
+			right.br_blockcount;
+		break;
 
 	case MASK(LEFT_CONTIG):
 		/*
@@ -2050,19 +2186,27 @@ xfs_bmap_add_extent_hole_real(
 		xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork);
 		ifp->if_lastex = idx - 1;
 		if (cur == NULL) {
-			*logflagsp = XFS_ILOG_FEXT(whichfork);
-			return 0;
+			rval = XFS_ILOG_FEXT(whichfork);
+		} else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+						new->br_blockcount,
+					left.br_state)))
+				goto done;
 		}
-		*logflagsp = 0;
-		if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
-				left.br_startblock, left.br_blockcount, &i)))
-			return error;
-		ASSERT(i == 1);
-		error = xfs_bmbt_update(cur, left.br_startoff,
-				left.br_startblock,
-				left.br_blockcount + new->br_blockcount,
-				left.br_state);
-		return error;
+		/* DELTA: One in-core extent grew. */
+		temp = left.br_startoff;
+		temp2 = left.br_blockcount +
+			new->br_blockcount;
+		break;
 
 	case MASK(RIGHT_CONTIG):
 		/*
@@ -2077,19 +2221,27 @@ xfs_bmap_add_extent_hole_real(
 		xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork);
 		ifp->if_lastex = idx;
 		if (cur == NULL) {
-			*logflagsp = XFS_ILOG_FEXT(whichfork);
-			return 0;
+			rval = XFS_ILOG_FEXT(whichfork);
+		} else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					right.br_startoff,
+					right.br_startblock,
+					right.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+						right.br_blockcount,
+					right.br_state)))
+				goto done;
 		}
-		*logflagsp = 0;
-		if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
-				right.br_startblock, right.br_blockcount, &i)))
-			return error;
-		ASSERT(i == 1);
-		error = xfs_bmbt_update(cur, new->br_startoff,
-				new->br_startblock,
-				new->br_blockcount + right.br_blockcount,
-				right.br_state);
-		return error;
+		/* DELTA: One in-core extent grew. */
+		temp = new->br_startoff;
+		temp2 = new->br_blockcount +
+			right.br_blockcount;
+		break;
 
 	case 0:
 		/*
@@ -2104,29 +2256,41 @@ xfs_bmap_add_extent_hole_real(
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL) {
-			*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
-			return 0;
+			rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+		} else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = new->br_state;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
 		}
-		*logflagsp = XFS_ILOG_CORE;
-		if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-				new->br_startblock, new->br_blockcount, &i)))
-			return error;
-		ASSERT(i == 0);
-		cur->bc_rec.b.br_state = new->br_state;
-		if ((error = xfs_bmbt_insert(cur, &i)))
-			return error;
-		ASSERT(i == 1);
-		return 0;
+		/* DELTA: A new extent was added in a hole. */
+		temp = new->br_startoff;
+		temp2 = new->br_blockcount;
+		break;
+	}
+	if (delta) {
+		temp2 += temp;
+		if (delta->xed_startoff > temp)
+			delta->xed_startoff = temp;
+		if (delta->xed_blockcount < temp2)
+			delta->xed_blockcount = temp2;
 	}
+done:
+	*logflagsp = rval;
+	return error;
 #undef	MASK
 #undef	MASK2
 #undef	STATE_SET
 #undef	STATE_TEST
 #undef	STATE_SET_TEST
 #undef	SWITCH_STATE
-	/* NOTREACHED */
-	ASSERT(0);
-	return 0; /* keep gcc quite */
 }
 
 /*
@@ -2598,6 +2762,7 @@ xfs_bmap_btalloc(
 	args.mp = mp;
 	args.fsbno = ap->rval;
 	args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+	args.firstblock = ap->firstblock;
 	blen = 0;
 	if (nullfb) {
 		args.type = XFS_ALLOCTYPE_START_BNO;
@@ -2657,7 +2822,7 @@ xfs_bmap_btalloc(
 		else
 			args.minlen = ap->alen;
 	} else if (ap->low) {
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
+		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.total = args.minlen = ap->minlen;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -2669,7 +2834,7 @@ xfs_bmap_btalloc(
 		args.prod = ap->ip->i_d.di_extsize;
 		if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
 			args.mod = (xfs_extlen_t)(args.prod - args.mod);
-	} else if (unlikely(mp->m_sb.sb_blocksize >= NBPP)) {
+	} else if (mp->m_sb.sb_blocksize >= NBPP) {
 		args.prod = 1;
 		args.mod = 0;
 	} else {
@@ -2885,6 +3050,7 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to allocate reserved blocks */
 {
@@ -3193,6 +3359,14 @@ xfs_bmap_del_extent(
 	if (da_old > da_new)
 		xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new),
 			rsvd);
+	if (delta) {
+		/* DELTA: report the original extent. */
+		if (delta->xed_startoff > got.br_startoff)
+			delta->xed_startoff = got.br_startoff;
+		if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
+			delta->xed_blockcount = got.br_startoff +
+							got.br_blockcount;
+	}
 done:
 	*logflagsp = flags;
 	return error;
@@ -3279,6 +3453,7 @@ xfs_bmap_extents_to_btree(
 	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
 	args.tp = tp;
 	args.mp = mp;
+	args.firstblock = *firstblock;
 	if (*firstblock == NULLFSBLOCK) {
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
@@ -3414,6 +3589,7 @@ xfs_bmap_local_to_extents(
 
 		args.tp = tp;
 		args.mp = ip->i_mount;
+		args.firstblock = *firstblock;
 		ASSERT((ifp->if_flags &
 			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
 		/*
@@ -3753,7 +3929,7 @@ xfs_bunmap_trace(
 	if (ip->i_rwtrace == NULL)
 		return;
 	ktrace_enter(ip->i_rwtrace,
-		(void *)(__psint_t)XFS_BUNMAPI,
+		(void *)(__psint_t)XFS_BUNMAP,
 		(void *)ip,
 		(void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
 		(void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
@@ -4087,8 +4263,8 @@ xfs_bmap_finish(
 			if (!XFS_FORCED_SHUTDOWN(mp))
 				xfs_force_shutdown(mp,
 						   (error == EFSCORRUPTED) ?
-						   XFS_CORRUPT_INCORE :
-						   XFS_METADATA_IO_ERROR);
+						   SHUTDOWN_CORRUPT_INCORE :
+						   SHUTDOWN_META_IO_ERROR);
 			return error;
 		}
 		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
@@ -4538,7 +4714,8 @@ xfs_bmapi(
 	xfs_extlen_t	total,		/* total blocks needed */
 	xfs_bmbt_irec_t	*mval,		/* output: map values */
 	int		*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t	*flist)		/* i/o: list extents to free */
+	xfs_bmap_free_t	*flist,		/* i/o: list extents to free */
+	xfs_extdelta_t	*delta)		/* o: change made to incore extents */
 {
 	xfs_fsblock_t	abno;		/* allocated block number */
 	xfs_extlen_t	alen;		/* allocated extent length */
@@ -4650,6 +4827,10 @@ xfs_bmapi(
 	end = bno + len;
 	obno = bno;
 	bma.ip = NULL;
+	if (delta) {
+		delta->xed_startoff = NULLFILEOFF;
+		delta->xed_blockcount = 0;
+	}
 	while (bno < end && n < *nmap) {
 		/*
 		 * Reading past eof, act as though there's a hole
@@ -4886,8 +5067,8 @@ xfs_bmapi(
 					got.br_state = XFS_EXT_UNWRITTEN;
 			}
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
-				firstblock, flist, &tmp_logflags, whichfork,
-				(flags & XFS_BMAPI_RSVBLOCKS));
+				firstblock, flist, &tmp_logflags, delta,
+				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
 				goto error0;
@@ -4983,8 +5164,8 @@ xfs_bmapi(
 			}
 			mval->br_state = XFS_EXT_NORM;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
-				firstblock, flist, &tmp_logflags, whichfork,
-				(flags & XFS_BMAPI_RSVBLOCKS));
+				firstblock, flist, &tmp_logflags, delta,
+				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
 				goto error0;
@@ -5073,7 +5254,14 @@ xfs_bmapi(
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
 	       XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
 	error = 0;
-
+	if (delta && delta->xed_startoff != NULLFILEOFF) {
+		/* A change was actually made.
+		 * Note that delta->xed_blockcount is an offset at this
+		 * point and needs to be converted to a block count.
+		 */
+		ASSERT(delta->xed_blockcount > delta->xed_startoff);
+		delta->xed_blockcount -= delta->xed_startoff;
+	}
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5185,6 +5373,8 @@ xfs_bunmapi(
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						   controls a.g. for allocs */
 	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_extdelta_t		*delta,		/* o: change made to incore
+						   extents */
 	int			*done)		/* set if not done yet */
 {
 	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
@@ -5242,6 +5432,10 @@ xfs_bunmapi(
 	bno = start + len - 1;
 	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
 		&prev);
+	if (delta) {
+		delta->xed_startoff = NULLFILEOFF;
+		delta->xed_blockcount = 0;
+	}
 	/*
 	 * Check to see if the given block number is past the end of the
 	 * file, back up to the last block if so...
@@ -5340,7 +5534,8 @@ xfs_bunmapi(
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
-				firstblock, flist, &logflags, XFS_DATA_FORK, 0);
+				firstblock, flist, &logflags, delta,
+				XFS_DATA_FORK, 0);
 			if (error)
 				goto error0;
 			goto nodelete;
@@ -5394,7 +5589,7 @@ xfs_bunmapi(
 				prev.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
 					&prev, firstblock, flist, &logflags,
-					XFS_DATA_FORK, 0);
+					delta, XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5403,7 +5598,7 @@ xfs_bunmapi(
 				del.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx, &cur,
 					&del, firstblock, flist, &logflags,
-					XFS_DATA_FORK, 0);
+					delta, XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5456,7 +5651,7 @@ xfs_bunmapi(
 			goto error0;
 		}
 		error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
-			&tmp_logflags, whichfork, rsvd);
+				&tmp_logflags, delta, whichfork, rsvd);
 		logflags |= tmp_logflags;
 		if (error)
 			goto error0;
@@ -5513,6 +5708,14 @@ nodelete:
 	ASSERT(ifp->if_ext_max ==
 	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 	error = 0;
+	if (delta && delta->xed_startoff != NULLFILEOFF) {
+		/* A change was actually made.
+		 * Note that delta->xed_blockcount is an offset at this
+		 * point and needs to be converted to a block count.
+		 */
+		ASSERT(delta->xed_blockcount > delta->xed_startoff);
+		delta->xed_blockcount -= delta->xed_startoff;
+	}
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5556,7 +5759,7 @@ xfs_getbmap(
 	__int64_t		fixlen;		/* length for -1 case */
 	int			i;		/* extent number */
 	xfs_inode_t		*ip;		/* xfs incore inode pointer */
-	vnode_t			*vp;		/* corresponding vnode */
+	bhv_vnode_t		*vp;		/* corresponding vnode */
 	int			lock;		/* lock state */
 	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
 	xfs_mount_t		*mp;		/* file system mount point */
@@ -5653,7 +5856,7 @@ xfs_getbmap(
 
 	if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
 		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
-		VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
+		error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF);
 	}
 
 	ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
@@ -5689,7 +5892,8 @@ xfs_getbmap(
 		nmap = (nexleft > subnex) ? subnex : nexleft;
 		error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 				  XFS_BB_TO_FSB(mp, bmv->bmv_length),
-				  bmapi_flags, NULL, 0, map, &nmap, NULL);
+				  bmapi_flags, NULL, 0, map, &nmap,
+				  NULL, NULL);
 		if (error)
 			goto unlock_and_return;
 		ASSERT(nmap <= subnex);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 8e0d73d9ccc4..80e93409b78d 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -26,6 +26,20 @@ struct xfs_mount;
 struct xfs_trans;
 
 /*
+ * DELTA: describe a change to the in-core extent list.
+ *
+ * Internally the use of xed_blockcount is somewhat funky.
+ * xed_blockcount contains an offset much of the time because this
+ * makes merging changes easier.  (xfs_fileoff_t and xfs_filblks_t are
+ * the same underlying type).
+ */
+typedef struct xfs_extdelta
+{
+	xfs_fileoff_t		xed_startoff;	/* start offset of range */
+	xfs_filblks_t		xed_blockcount;	/* blocks, or end offset */
+} xfs_extdelta_t;
+
+/*
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
@@ -275,7 +289,9 @@ xfs_bmapi(
 	xfs_extlen_t		total,		/* total blocks needed */
 	struct xfs_bmbt_irec	*mval,		/* output: map values */
 	int			*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t		*flist);	/* i/o: list extents to free */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_extdelta_t		*delta);	/* o: change made to incore
+						   extents */
 
 /*
  * Map file blocks to filesystem blocks, simple version.
@@ -309,6 +325,8 @@ xfs_bunmapi(
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						   controls a.g. for allocs */
 	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_extdelta_t		*delta,		/* o: change made to incore
+						   extents */
 	int			*done);		/* set if not done yet */
 
 /*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bea44709afbe..18fb7385d719 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1569,12 +1567,11 @@ xfs_bmbt_split(
 	lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
 	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
 	args.fsbno = cur->bc_private.b.firstblock;
+	args.firstblock = args.fsbno;
 	if (args.fsbno == NULLFSBLOCK) {
 		args.fsbno = lbno;
 		args.type = XFS_ALLOCTYPE_START_BNO;
-	} else if (cur->bc_private.b.flist->xbf_low)
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-	else
+	} else
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	args.mod = args.minleft = args.alignment = args.total = args.isfl =
 		args.userdata = args.minalignslop = 0;
@@ -2356,6 +2353,7 @@ xfs_bmbt_newroot(
 		args.userdata = args.minalignslop = 0;
 	args.minlen = args.maxlen = args.prod = 1;
 	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+	args.firstblock = args.fsbno;
 	if (args.fsbno == NULLFSBLOCK) {
 #ifdef DEBUG
 		if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
@@ -2365,9 +2363,7 @@ xfs_bmbt_newroot(
 #endif
 		args.fsbno = INT_GET(*pp, ARCH_CONVERT);
 		args.type = XFS_ALLOCTYPE_START_BNO;
-	} else if (args.wasdel)
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-	else
+	} else
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	if ((error = xfs_alloc_vextent(&args))) {
 		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 52d5d095fc35..ee2255bd6562 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 5fed15682dda..a4aa53974f76 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -23,7 +23,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_buf_item.h"
@@ -1030,9 +1029,9 @@ xfs_buf_iodone_callbacks(
 		if ((XFS_BUF_TARGET(bp) != lasttarg) ||
 		    (time_after(jiffies, (lasttime + 5*HZ)))) {
 			lasttime = jiffies;
-			prdev("XFS write error in file system meta-data "
-			      "block 0x%llx in %s",
-			      XFS_BUF_TARGET(bp),
+			cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
+					" block 0x%llx in %s",
+				XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
 			      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
 		}
 		lasttarg = XFS_BUF_TARGET(bp);
@@ -1108,7 +1107,7 @@ xfs_buf_error_relse(
 	XFS_BUF_ERROR(bp,0);
 	xfs_buftrace("BUF_ERROR_RELSE", bp);
 	if (! XFS_FORCED_SHUTDOWN(mp))
-		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 	/*
 	 * We have to unpin the pinned buffers so do the
 	 * callbacks.
diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h
index d0035c6e9514..7a0e482dd436 100644
--- a/fs/xfs/xfs_cap.h
+++ b/fs/xfs/xfs_cap.h
@@ -49,12 +49,12 @@ typedef struct xfs_cap_set {
 
 #include <linux/posix_cap_xattr.h>
 
-struct vnode;
+struct bhv_vnode;
 
-extern int xfs_cap_vhascap(struct vnode *);
-extern int xfs_cap_vset(struct vnode *, void *, size_t);
-extern int xfs_cap_vget(struct vnode *, void *, size_t);
-extern int xfs_cap_vremove(struct vnode *vp);
+extern int xfs_cap_vhascap(struct bhv_vnode *);
+extern int xfs_cap_vset(struct bhv_vnode *, void *, size_t);
+extern int xfs_cap_vget(struct bhv_vnode *, void *, size_t);
+extern int xfs_cap_vremove(struct bhv_vnode *);
 
 #define _CAP_EXISTS		xfs_cap_vhascap
 
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 8988b9051175..32ab61d17ace 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -43,7 +41,6 @@
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -159,7 +156,7 @@ xfs_da_split(xfs_da_state_t *state)
 	max = state->path.active - 1;
 	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
 	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
-	       state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
 
 	addblk = &state->path.blk[max];		/* initial dummy value */
 	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
@@ -199,38 +196,7 @@ xfs_da_split(xfs_da_state_t *state)
 				return(error);	/* GROT: attr inconsistent */
 			addblk = newblk;
 			break;
-		case XFS_DIR_LEAF_MAGIC:
-			ASSERT(XFS_DIR_IS_V1(state->mp));
-			error = xfs_dir_leaf_split(state, oldblk, newblk);
-			if ((error != 0) && (error != ENOSPC)) {
-				return(error);	/* GROT: dir is inconsistent */
-			}
-			if (!error) {
-				addblk = newblk;
-				break;
-			}
-			/*
-			 * Entry wouldn't fit, split the leaf again.
-			 */
-			state->extravalid = 1;
-			if (state->inleaf) {
-				state->extraafter = 0;	/* before newblk */
-				error = xfs_dir_leaf_split(state, oldblk,
-							   &state->extrablk);
-				if (error)
-					return(error);	/* GROT: dir incon. */
-				addblk = newblk;
-			} else {
-				state->extraafter = 1;	/* after newblk */
-				error = xfs_dir_leaf_split(state, newblk,
-							   &state->extrablk);
-				if (error)
-					return(error);	/* GROT: dir incon. */
-				addblk = newblk;
-			}
-			break;
 		case XFS_DIR2_LEAFN_MAGIC:
-			ASSERT(XFS_DIR_IS_V2(state->mp));
 			error = xfs_dir2_leafn_split(state, oldblk, newblk);
 			if (error)
 				return error;
@@ -363,7 +329,6 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
 			     (char *)oldroot);
 	} else {
-		ASSERT(XFS_DIR_IS_V2(mp));
 		ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
 		leaf = (xfs_dir2_leaf_t *)oldroot;
 		size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
@@ -379,8 +344,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	 * Set up the new root node.
 	 */
 	error = xfs_da_node_create(args,
-		args->whichfork == XFS_DATA_FORK &&
-		XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0,
+		(args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
 		be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
 	if (error)
 		return(error);
@@ -427,10 +391,9 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
 
 	/*
-	 * With V2 the extra block is data or freespace.
+	 * With V2 dirs the extra block is data or freespace.
 	 */
-	useextra = state->extravalid && (XFS_DIR_IS_V1(state->mp) ||
-			state->args->whichfork == XFS_ATTR_FORK);
+	useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
 	newcount = 1 + useextra;
 	/*
 	 * Do we have to split the node?
@@ -624,7 +587,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
 	ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
 	ASSERT(newblk->blkno != 0);
-	if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+	if (state->args->whichfork == XFS_DATA_FORK)
 		ASSERT(newblk->blkno >= mp->m_dirleafblk &&
 		       newblk->blkno < mp->m_dirfreeblk);
 
@@ -670,7 +633,7 @@ xfs_da_join(xfs_da_state_t *state)
 	save_blk = &state->altpath.blk[ state->path.active-1 ];
 	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
 	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
-	       drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+	       drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
 
 	/*
 	 * Walk back up the tree joining/deallocating as necessary.
@@ -693,17 +656,7 @@ xfs_da_join(xfs_da_state_t *state)
 				return(0);
 			xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
 			break;
-		case XFS_DIR_LEAF_MAGIC:
-			ASSERT(XFS_DIR_IS_V1(state->mp));
-			error = xfs_dir_leaf_toosmall(state, &action);
-			if (error)
-				return(error);
-			if (action == 0)
-				return(0);
-			xfs_dir_leaf_unbalance(state, drop_blk, save_blk);
-			break;
 		case XFS_DIR2_LEAFN_MAGIC:
-			ASSERT(XFS_DIR_IS_V2(state->mp));
 			error = xfs_dir2_leafn_toosmall(state, &action);
 			if (error)
 				return error;
@@ -790,7 +743,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
 	ASSERT(bp != NULL);
 	blkinfo = bp->data;
 	if (be16_to_cpu(oldroot->hdr.level) == 1) {
-		ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC ||
 		       be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC);
 	} else {
 		ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC);
@@ -951,14 +904,7 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
 		if (count == 0)
 			return;
 		break;
-	case XFS_DIR_LEAF_MAGIC:
-		ASSERT(XFS_DIR_IS_V1(state->mp));
-		lasthash = xfs_dir_leaf_lasthash(blk->bp, &count);
-		if (count == 0)
-			return;
-		break;
 	case XFS_DIR2_LEAFN_MAGIC:
-		ASSERT(XFS_DIR_IS_V2(state->mp));
 		lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
 		if (count == 0)
 			return;
@@ -1117,10 +1063,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 	 * Descend thru the B-tree searching each level for the right
 	 * node to use, until the right hashval is found.
 	 */
-	if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp))
-		blkno = state->mp->m_dirleafblk;
-	else
-		blkno = 0;
+	blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
 	for (blk = &state->path.blk[0], state->path.active = 1;
 			 state->path.active <= XFS_DA_NODE_MAXDEPTH;
 			 blk++, state->path.active++) {
@@ -1137,7 +1080,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 		}
 		curr = blk->bp->data;
 		ASSERT(be16_to_cpu(curr->magic) == XFS_DA_NODE_MAGIC ||
-		       be16_to_cpu(curr->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC ||
 		       be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC);
 
 		/*
@@ -1190,16 +1133,10 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 				blk->index = probe;
 				blkno = be32_to_cpu(btree->before);
 			}
-		}
-		else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) {
+		} else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) {
 			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
 			break;
-		}
-		else if (be16_to_cpu(curr->magic) == XFS_DIR_LEAF_MAGIC) {
-			blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL);
-			break;
-		}
-		else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) {
+		} else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) {
 			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
 			break;
 		}
@@ -1212,12 +1149,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 	 * next leaf and keep searching.
 	 */
 	for (;;) {
-		if (blk->magic == XFS_DIR_LEAF_MAGIC) {
-			ASSERT(XFS_DIR_IS_V1(state->mp));
-			retval = xfs_dir_leaf_lookup_int(blk->bp, args,
-								  &blk->index);
-		} else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
-			ASSERT(XFS_DIR_IS_V2(state->mp));
+		if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
 			retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
 							&blk->index, state);
 		}
@@ -1270,7 +1202,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 	old_info = old_blk->bp->data;
 	new_info = new_blk->bp->data;
 	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
-	       old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
 	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
 	ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
 	ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
@@ -1280,12 +1212,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 	case XFS_ATTR_LEAF_MAGIC:
 		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
 		break;
-	case XFS_DIR_LEAF_MAGIC:
-		ASSERT(XFS_DIR_IS_V1(state->mp));
-		before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp);
-		break;
 	case XFS_DIR2_LEAFN_MAGIC:
-		ASSERT(XFS_DIR_IS_V2(state->mp));
 		before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
 		break;
 	case XFS_DA_NODE_MAGIC:
@@ -1404,7 +1331,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 	save_info = save_blk->bp->data;
 	drop_info = drop_blk->bp->data;
 	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
-	       save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
 	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
 	ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
 	ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
@@ -1529,7 +1456,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 		ASSERT(blk->bp != NULL);
 		info = blk->bp->data;
 		ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC ||
-		       be16_to_cpu(info->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC ||
 		       be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC);
 		blk->magic = be16_to_cpu(info->magic);
 		if (blk->magic == XFS_DA_NODE_MAGIC) {
@@ -1548,20 +1475,13 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 				blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
 								      NULL);
 				break;
-			case XFS_DIR_LEAF_MAGIC:
-				ASSERT(XFS_DIR_IS_V1(state->mp));
-				blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
-								     NULL);
-				break;
 			case XFS_DIR2_LEAFN_MAGIC:
-				ASSERT(XFS_DIR_IS_V2(state->mp));
 				blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
 								       NULL);
 				break;
 			default:
 				ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
-				       blk->magic ==
-				       XFS_DIRX_LEAF_MAGIC(state->mp));
+				       blk->magic == XFS_DIR2_LEAFN_MAGIC);
 				break;
 			}
 		}
@@ -1620,7 +1540,6 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	xfs_bmbt_irec_t	*mapp;
 	xfs_inode_t *dp;
 	int nmap, error, w, count, c, got, i, mapi;
-	xfs_fsize_t size;
 	xfs_trans_t *tp;
 	xfs_mount_t *mp;
 
@@ -1631,7 +1550,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	/*
 	 * For new directories adjust the file offset and block count.
 	 */
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
+	if (w == XFS_DATA_FORK) {
 		bno = mp->m_dirleafblk;
 		count = mp->m_dirblkfsbs;
 	} else {
@@ -1641,10 +1560,9 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	/*
 	 * Find a spot in the file space to put the new block.
 	 */
-	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w)))
 		return error;
-	}
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+	if (w == XFS_DATA_FORK)
 		ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
 	/*
 	 * Try mapping it in one filesystem block.
@@ -1655,7 +1573,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 			XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
 			XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
-			args->flist))) {
+			args->flist, NULL))) {
 		return error;
 	}
 	ASSERT(nmap <= 1);
@@ -1676,7 +1594,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 					XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
 					XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
-					&mapp[mapi], &nmap, args->flist))) {
+					&mapp[mapi], &nmap, args->flist,
+					NULL))) {
 				kmem_free(mapp, sizeof(*mapp) * count);
 				return error;
 			}
@@ -1705,19 +1624,6 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	if (mapp != &map)
 		kmem_free(mapp, sizeof(*mapp) * count);
 	*new_blkno = (xfs_dablk_t)bno;
-	/*
-	 * For version 1 directories, adjust the file size if it changed.
-	 */
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
-		ASSERT(mapi == 1);
-		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
-			return error;
-		size = XFS_FSB_TO_B(mp, bno);
-		if (size != dp->i_d.di_size) {
-			dp->i_d.di_size = size;
-			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-		}
-	}
 	return 0;
 }
 
@@ -1742,7 +1648,6 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 	int error, w, entno, level, dead_level;
 	xfs_da_blkinfo_t *dead_info, *sib_info;
 	xfs_da_intnode_t *par_node, *dead_node;
-	xfs_dir_leafblock_t *dead_leaf;
 	xfs_dir2_leaf_t *dead_leaf2;
 	xfs_dahash_t dead_hash;
 
@@ -1753,11 +1658,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 	w = args->whichfork;
 	ASSERT(w == XFS_DATA_FORK);
 	mp = ip->i_mount;
-	if (XFS_DIR_IS_V2(mp)) {
-		lastoff = mp->m_dirfreeblk;
-		error = xfs_bmap_last_before(tp, ip, &lastoff, w);
-	} else
-		error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
+	lastoff = mp->m_dirfreeblk;
+	error = xfs_bmap_last_before(tp, ip, &lastoff, w);
 	if (error)
 		return error;
 	if (unlikely(lastoff == 0)) {
@@ -1780,14 +1682,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 	/*
 	 * Get values from the moved block.
 	 */
-	if (be16_to_cpu(dead_info->magic) == XFS_DIR_LEAF_MAGIC) {
-		ASSERT(XFS_DIR_IS_V1(mp));
-		dead_leaf = (xfs_dir_leafblock_t *)dead_info;
-		dead_level = 0;
-		dead_hash =
-			INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
-	} else if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
-		ASSERT(XFS_DIR_IS_V2(mp));
+	if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
 		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
 		dead_level = 0;
 		dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
@@ -1842,7 +1737,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 		xfs_da_buf_done(sib_buf);
 		sib_buf = NULL;
 	}
-	par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
+	par_blkno = mp->m_dirleafblk;
 	level = -1;
 	/*
 	 * Walk down the tree looking for the parent of the moved block.
@@ -1941,8 +1836,6 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 {
 	xfs_inode_t *dp;
 	int done, error, w, count;
-	xfs_fileoff_t bno;
-	xfs_fsize_t size;
 	xfs_trans_t *tp;
 	xfs_mount_t *mp;
 
@@ -1950,7 +1843,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 	w = args->whichfork;
 	tp = args->trans;
 	mp = dp->i_mount;
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+	if (w == XFS_DATA_FORK)
 		count = mp->m_dirblkfsbs;
 	else
 		count = 1;
@@ -1961,34 +1854,17 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 		 */
 		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
 				XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
-				0, args->firstblock, args->flist,
+				0, args->firstblock, args->flist, NULL,
 				&done)) == ENOSPC) {
 			if (w != XFS_DATA_FORK)
-				goto done;
+				break;
 			if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
 					&dead_buf)))
-				goto done;
-		} else if (error)
-			goto done;
-		else
+				break;
+		} else {
 			break;
-	}
-	ASSERT(done);
-	xfs_da_binval(tp, dead_buf);
-	/*
-	 * Adjust the directory size for version 1.
-	 */
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
-		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
-			return error;
-		size = XFS_FSB_TO_B(dp->i_mount, bno);
-		if (size != dp->i_d.di_size) {
-			dp->i_d.di_size = size;
-			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 		}
 	}
-	return 0;
-done:
 	xfs_da_binval(tp, dead_buf);
 	return error;
 }
@@ -2049,10 +1925,7 @@ xfs_da_do_buf(
 	xfs_dabuf_t	*rbp;
 
 	mp = dp->i_mount;
-	if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
-		nfsb = mp->m_dirblkfsbs;
-	else
-		nfsb = 1;
+	nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
 	mappedbno = *mappedbnop;
 	/*
 	 * Caller doesn't have a mapping.  -2 means don't complain
@@ -2086,7 +1959,7 @@ xfs_da_do_buf(
 					nfsb,
 					XFS_BMAPI_METADATA |
 						XFS_BMAPI_AFLAG(whichfork),
-					NULL, 0, mapp, &nmap, NULL)))
+					NULL, 0, mapp, &nmap, NULL, NULL)))
 				goto exit0;
 		}
 	} else {
@@ -2198,7 +2071,6 @@ xfs_da_do_buf(
 		magic1 = be32_to_cpu(data->hdr.magic);
 		if (unlikely(
 		    XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
-				   (magic != XFS_DIR_LEAF_MAGIC) &&
 				   (magic != XFS_ATTR_LEAF_MAGIC) &&
 				   (magic != XFS_DIR2_LEAF1_MAGIC) &&
 				   (magic != XFS_DIR2_LEAFN_MAGIC) &&
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 243a730d5ec8..4ab865ec8b82 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -36,14 +36,10 @@ struct zone;
  * level in the Btree, and to identify which type of block this is.
  */
 #define XFS_DA_NODE_MAGIC	0xfebe	/* magic number: non-leaf blocks */
-#define XFS_DIR_LEAF_MAGIC	0xfeeb	/* magic number: directory leaf blks */
 #define XFS_ATTR_LEAF_MAGIC	0xfbee	/* magic number: attribute leaf blks */
 #define	XFS_DIR2_LEAF1_MAGIC	0xd2f1	/* magic number: v2 dirlf single blks */
 #define	XFS_DIR2_LEAFN_MAGIC	0xd2ff	/* magic number: v2 dirlf multi blks */
 
-#define	XFS_DIRX_LEAF_MAGIC(mp)	\
-	(XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
-
 typedef struct xfs_da_blkinfo {
 	__be32		forw;			/* previous block in list */
 	__be32		back;			/* following block in list */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 4968a6358e61..80562b60fb95 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -54,24 +52,14 @@ xfs_swapext(
 	xfs_swapext_t	__user *sxu)
 {
 	xfs_swapext_t	*sxp;
-	xfs_inode_t     *ip=NULL, *tip=NULL, *ips[2];
-	xfs_trans_t     *tp;
+	xfs_inode_t     *ip=NULL, *tip=NULL;
 	xfs_mount_t     *mp;
-	xfs_bstat_t	*sbp;
 	struct file	*fp = NULL, *tfp = NULL;
-	vnode_t		*vp, *tvp;
-	static uint	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
-	int		ilf_fields, tilf_fields;
+	bhv_vnode_t	*vp, *tvp;
 	int		error = 0;
-	xfs_ifork_t	*tempifp, *ifp, *tifp;
-	__uint64_t	tmp;
-	int		aforkblks = 0;
-	int		taforkblks = 0;
-	char		locked = 0;
 
 	sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
-	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
-	if (!sxp || !tempifp) {
+	if (!sxp) {
 		error = XFS_ERROR(ENOMEM);
 		goto error0;
 	}
@@ -118,14 +106,56 @@ xfs_swapext(
 
 	mp = ip->i_mount;
 
-	sbp = &sxp->sx_stat;
-
 	if (XFS_FORCED_SHUTDOWN(mp)) {
 		error =  XFS_ERROR(EIO);
 		goto error0;
 	}
 
-	locked = 1;
+	error = XFS_SWAP_EXTENTS(mp, &ip->i_iocore, &tip->i_iocore, sxp);
+
+ error0:
+	if (fp != NULL)
+		fput(fp);
+	if (tfp != NULL)
+		fput(tfp);
+
+	if (sxp != NULL)
+		kmem_free(sxp, sizeof(xfs_swapext_t));
+
+	return error;
+}
+
+int
+xfs_swap_extents(
+	xfs_inode_t	*ip,
+	xfs_inode_t	*tip,
+	xfs_swapext_t	*sxp)
+{
+	xfs_mount_t	*mp;
+	xfs_inode_t	*ips[2];
+	xfs_trans_t	*tp;
+	xfs_bstat_t	*sbp = &sxp->sx_stat;
+	bhv_vnode_t	*vp, *tvp;
+	xfs_ifork_t	*tempifp, *ifp, *tifp;
+	int		ilf_fields, tilf_fields;
+	static uint	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+	int		error = 0;
+	int		aforkblks = 0;
+	int		taforkblks = 0;
+	__uint64_t	tmp;
+	char		locked = 0;
+
+	mp = ip->i_mount;
+
+	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+	if (!tempifp) {
+		error = XFS_ERROR(ENOMEM);
+		goto error0;
+	}
+
+	sbp = &sxp->sx_stat;
+	vp = XFS_ITOV(ip);
+	tvp = XFS_ITOV(tip);
 
 	/* Lock in i_ino order */
 	if (ip->i_ino < tip->i_ino) {
@@ -137,6 +167,7 @@ xfs_swapext(
 	}
 
 	xfs_lock_inodes(ips, 2, 0, lock_flags);
+	locked = 1;
 
 	/* Check permissions */
 	error = xfs_iaccess(ip, S_IWUSR, NULL);
@@ -169,7 +200,7 @@ xfs_swapext(
 
 	if (VN_CACHED(tvp) != 0) {
 		xfs_inval_cached_trace(&tip->i_iocore, 0, -1, 0, -1);
-		VOP_FLUSHINVAL_PAGES(tvp, 0, -1, FI_REMAPF_LOCKED);
+		bhv_vop_flushinval_pages(tvp, 0, -1, FI_REMAPF_LOCKED);
 	}
 
 	/* Verify O_DIRECT for ftmp */
@@ -214,7 +245,7 @@ xfs_swapext(
 	/* We need to fail if the file is memory mapped.  Once we have tossed
 	 * all existing pages, the page fault will have no option
 	 * but to go to the filesystem for pages. By making the page fault call
-	 * VOP_READ (or write in the case of autogrow) they block on the iolock
+	 * vop_read (or write in the case of autogrow) they block on the iolock
 	 * until we have switched the extents.
 	 */
 	if (VN_MAPPED(vp)) {
@@ -233,7 +264,7 @@ xfs_swapext(
 	 * fields change.
 	 */
 
-	VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+	bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
 	if ((error = xfs_trans_reserve(tp, 0,
@@ -360,16 +391,7 @@ xfs_swapext(
 		xfs_iunlock(ip,  lock_flags);
 		xfs_iunlock(tip, lock_flags);
 	}
-
-	if (fp != NULL)
-		fput(fp);
-	if (tfp != NULL)
-		fput(tfp);
-
-	if (sxp != NULL)
-		kmem_free(sxp, sizeof(xfs_swapext_t));
 	if (tempifp != NULL)
 		kmem_free(tempifp, sizeof(xfs_ifork_t));
-
 	return error;
 }
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index f678559abc45..da178205be68 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,6 +48,9 @@ typedef struct xfs_swapext
  */
 int	xfs_swapext(struct xfs_swapext __user *sx);
 
+int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+		struct xfs_swapext *sxp);
+
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 79d0d9e1fbab..b33826961c45 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -85,7 +85,6 @@ typedef struct xfs_dinode
 	union {
 		xfs_bmdr_block_t di_bmbt;	/* btree root block */
 		xfs_bmbt_rec_32_t di_bmx[1];	/* extent list */
-		xfs_dir_shortform_t di_dirsf;	/* shortform directory */
 		xfs_dir2_sf_t	di_dir2sf;	/* shortform directory v2 */
 		char		di_c[1];	/* local contents */
 		xfs_dev_t	di_dev;		/* device for S_IFCHR/S_IFBLK */
@@ -257,6 +256,7 @@ typedef enum xfs_dinode_fmt
 #define XFS_DIFLAG_NOSYMLINKS_BIT   10	/* disallow symlink creation */
 #define XFS_DIFLAG_EXTSIZE_BIT      11	/* inode extent size allocator hint */
 #define XFS_DIFLAG_EXTSZINHERIT_BIT 12	/* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT     13	/* do not reorganize/defragment */
 #define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
 #define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
 #define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
@@ -270,12 +270,13 @@ typedef enum xfs_dinode_fmt
 #define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
 #define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
 #define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
 
 #define XFS_DIFLAG_ANY \
 	(XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
 	 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
 	 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
 	 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
-	 XFS_DIFLAG_EXTSZINHERIT)
+	 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG)
 
 #endif	/* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
deleted file mode 100644
index 9cc702a839a3..000000000000
--- a/fs/xfs/xfs_dir.c
+++ /dev/null
@@ -1,1217 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_error.h"
-
-/*
- * xfs_dir.c
- *
- * Provide the external interfaces to manage directories.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Functions for the dirops interfaces.
- */
-static void	xfs_dir_mount(struct xfs_mount *mp);
-
-static int	xfs_dir_isempty(struct xfs_inode *dp);
-
-static int	xfs_dir_init(struct xfs_trans *trans,
-			     struct xfs_inode *dir,
-			     struct xfs_inode *parent_dir);
-
-static int	xfs_dir_createname(struct xfs_trans *trans,
-				   struct xfs_inode *dp,
-				   char *name_string,
-				   int name_len,
-				   xfs_ino_t inode_number,
-				   xfs_fsblock_t *firstblock,
-				   xfs_bmap_free_t *flist,
-				   xfs_extlen_t total);
-
-static int	xfs_dir_lookup(struct xfs_trans *tp,
-			       struct xfs_inode *dp,
-			       char *name_string,
-			       int name_length,
-			       xfs_ino_t *inode_number);
-
-static int	xfs_dir_removename(struct xfs_trans *trans,
-				   struct xfs_inode *dp,
-				   char *name_string,
-				   int name_length,
-				   xfs_ino_t ino,
-				   xfs_fsblock_t *firstblock,
-				   xfs_bmap_free_t *flist,
-				   xfs_extlen_t total);
-
-static int	xfs_dir_getdents(struct xfs_trans *tp,
-				 struct xfs_inode *dp,
-				 struct uio *uiop,
-				 int *eofp);
-
-static int	xfs_dir_replace(struct xfs_trans *tp,
-				struct xfs_inode *dp,
-				char *name_string,
-				int name_length,
-				xfs_ino_t inode_number,
-				xfs_fsblock_t *firstblock,
-				xfs_bmap_free_t *flist,
-				xfs_extlen_t total);
-
-static int	xfs_dir_canenter(struct xfs_trans *tp,
-				 struct xfs_inode *dp,
-				 char *name_string,
-				 int name_length);
-
-static int	xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp,
-						  xfs_dinode_t *dip);
-
-xfs_dirops_t xfsv1_dirops = {
-	.xd_mount			= xfs_dir_mount,
-	.xd_isempty			= xfs_dir_isempty,
-	.xd_init			= xfs_dir_init,
-	.xd_createname			= xfs_dir_createname,
-	.xd_lookup			= xfs_dir_lookup,
-	.xd_removename			= xfs_dir_removename,
-	.xd_getdents			= xfs_dir_getdents,
-	.xd_replace			= xfs_dir_replace,
-	.xd_canenter			= xfs_dir_canenter,
-	.xd_shortform_validate_ondisk	= xfs_dir_shortform_validate_ondisk,
-	.xd_shortform_to_single		= xfs_dir_shortform_to_leaf,
-};
-
-/*
- * Internal routines when dirsize == XFS_LBSIZE(mp).
- */
-STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args);
-STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries,
-						 int *total_namebytes);
-STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
-					     uio_t *uio, int *eofp,
-					     xfs_dirent_t *dbp,
-					     xfs_dir_put_t put);
-STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args);
-
-/*
- * Internal routines when dirsize > XFS_LBSIZE(mp).
- */
-STATIC int xfs_dir_node_addname(xfs_da_args_t *args);
-STATIC int xfs_dir_node_lookup(xfs_da_args_t *args);
-STATIC int xfs_dir_node_removename(xfs_da_args_t *args);
-STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
-					     uio_t *uio, int *eofp,
-					     xfs_dirent_t *dbp,
-					     xfs_dir_put_t put);
-STATIC int xfs_dir_node_replace(xfs_da_args_t *args);
-
-#if defined(XFS_DIR_TRACE)
-ktrace_t *xfs_dir_trace_buf;
-#endif
-
-
-/*========================================================================
- * Overall external interface routines.
- *========================================================================*/
-
-xfs_dahash_t	xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-
-/*
- * One-time startup routine called from xfs_init().
- */
-void
-xfs_dir_startup(void)
-{
-	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
-	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
-}
-
-/*
- * Initialize directory-related fields in the mount structure.
- */
-static void
-xfs_dir_mount(xfs_mount_t *mp)
-{
-	uint shortcount, leafcount, count;
-
-	mp->m_dirversion = 1;
-	if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
-		shortcount = (mp->m_attroffset -
-				(uint)sizeof(xfs_dir_sf_hdr_t)) /
-				 (uint)sizeof(xfs_dir_sf_entry_t);
-		leafcount = (XFS_LBSIZE(mp) -
-				(uint)sizeof(xfs_dir_leaf_hdr_t)) /
-				 ((uint)sizeof(xfs_dir_leaf_entry_t) +
-				  (uint)sizeof(xfs_dir_leaf_name_t));
-	} else {
-		shortcount = (XFS_BMDR_SPACE_CALC(MINABTPTRS) -
-			      (uint)sizeof(xfs_dir_sf_hdr_t)) /
-			       (uint)sizeof(xfs_dir_sf_entry_t);
-		leafcount = (XFS_LBSIZE(mp) -
-			    (uint)sizeof(xfs_dir_leaf_hdr_t)) /
-			     ((uint)sizeof(xfs_dir_leaf_entry_t) +
-			      (uint)sizeof(xfs_dir_leaf_name_t));
-	}
-	count = shortcount > leafcount ? shortcount : leafcount;
-	mp->m_dircook_elog = xfs_da_log2_roundup(count + 1);
-	ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog);
-	mp->m_dir_node_ents = mp->m_attr_node_ents =
-		(XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) /
-		(uint)sizeof(xfs_da_node_entry_t);
-	mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100;
-	mp->m_dirblksize = mp->m_sb.sb_blocksize;
-	mp->m_dirblkfsbs = 1;
-}
-
-/*
- * Return 1 if directory contains only "." and "..".
- */
-static int
-xfs_dir_isempty(xfs_inode_t *dp)
-{
-	xfs_dir_sf_hdr_t *hdr;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	if (dp->i_d.di_size == 0)
-		return(1);
-	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
-		return(0);
-	hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	return(hdr->count == 0);
-}
-
-/*
- * Initialize a directory with its "." and ".." entries.
- */
-static int
-xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir)
-{
-	xfs_da_args_t args;
-	int error;
-
-	memset((char *)&args, 0, sizeof(args));
-	args.dp = dir;
-	args.trans = trans;
-
-	ASSERT((dir->i_d.di_mode & S_IFMT) == S_IFDIR);
-	if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino)))
-		return error;
-
-	return(xfs_dir_shortform_create(&args, parent_dir->i_ino));
-}
-
-/*
- * Generic handler routine to add a name to a directory.
- * Transitions directory from shortform to Btree as necessary.
- */
-static int							/* error */
-xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
-		   int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock,
-		   xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-	xfs_da_args_t args;
-	int retval, newsize, done;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
-	if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
-		return (retval);
-
-	XFS_STATS_INC(xs_dir_create);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = inum;
-	args.dp = dp;
-	args.firstblock = firstblock;
-	args.flist = flist;
-	args.total = total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = 0;
-	args.addname = args.oknoent = 1;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	done = 0;
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
-		if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) {
-			retval = xfs_dir_shortform_addname(&args);
-			done = 1;
-		} else {
-			if (total == 0)
-				return XFS_ERROR(ENOSPC);
-			retval = xfs_dir_shortform_to_leaf(&args);
-			done = retval != 0;
-		}
-	}
-	if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_addname(&args);
-		done = retval != ENOSPC;
-		if (!done) {
-			if (total == 0)
-				return XFS_ERROR(ENOSPC);
-			retval = xfs_dir_leaf_to_node(&args);
-			done = retval != 0;
-		}
-	}
-	if (!done) {
-		retval = xfs_dir_node_addname(&args);
-	}
-	return(retval);
-}
-
-/*
- * Generic handler routine to check if a name can be added to a directory,
- * without adding any blocks to the directory.
- */
-static int							/* error */
-xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen)
-{
-	xfs_da_args_t args;
-	int retval, newsize;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = 0;
-	args.dp = dp;
-	args.firstblock = NULL;
-	args.flist = NULL;
-	args.total = 0;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = args.addname = args.oknoent = 1;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
-		if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp))
-			retval = 0;
-		else
-			retval = XFS_ERROR(ENOSPC);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_addname(&args);
-	} else {
-		retval = xfs_dir_node_addname(&args);
-	}
-	return(retval);
-}
-
-/*
- * Generic handler routine to remove a name from a directory.
- * Transitions directory from Btree to shortform as necessary.
- */
-static int							/* error */
-xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
-		   int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock,
-		   xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-	xfs_da_args_t args;
-	int count, totallen, newsize, retval;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	XFS_STATS_INC(xs_dir_remove);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = ino;
-	args.dp = dp;
-	args.firstblock = firstblock;
-	args.flist = flist;
-	args.total = total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = args.addname = args.oknoent = 0;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		retval = xfs_dir_shortform_removename(&args);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_removename(&args, &count, &totallen);
-		if (retval == 0) {
-			newsize = XFS_DIR_SF_ALLFIT(count, totallen);
-			if (newsize <= XFS_IFORK_DSIZE(dp)) {
-				retval = xfs_dir_leaf_to_shortform(&args);
-			}
-		}
-	} else {
-		retval = xfs_dir_node_removename(&args);
-	}
-	return(retval);
-}
-
-static int							/* error */
-xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
-				   xfs_ino_t *inum)
-{
-	xfs_da_args_t args;
-	int retval;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
-	XFS_STATS_INC(xs_dir_lookup);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = 0;
-	args.dp = dp;
-	args.firstblock = NULL;
-	args.flist = NULL;
-	args.total = 0;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = args.addname = 0;
-	args.oknoent = 1;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		retval = xfs_dir_shortform_lookup(&args);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_lookup(&args);
-	} else {
-		retval = xfs_dir_node_lookup(&args);
-	}
-	if (retval == EEXIST)
-		retval = 0;
-	*inum = args.inumber;
-	return(retval);
-}
-
-/*
- * Implement readdir.
- */
-static int							/* error */
-xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp)
-{
-	xfs_dirent_t *dbp;
-	int  alignment, retval;
-	xfs_dir_put_t put;
-
-	XFS_STATS_INC(xs_dir_getdents);
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
-	/*
-	 * If our caller has given us a single contiguous memory buffer,
-	 * just work directly within that buffer.  If it's in user memory,
-	 * lock it down first.
-	 */
-	alignment = sizeof(xfs_off_t) - 1;
-	if ((uio->uio_iovcnt == 1) &&
-	    (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
-	    ((uio->uio_iov[0].iov_len & alignment) == 0)) {
-		dbp = NULL;
-		put = xfs_dir_put_dirent64_direct;
-	} else {
-		dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
-		put = xfs_dir_put_dirent64_uio;
-	}
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	*eofp = 0;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put);
-	} else {
-		retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put);
-	}
-	if (dbp != NULL)
-		kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
-
-	return(retval);
-}
-
-static int							/* error */
-xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
-				    xfs_ino_t inum, xfs_fsblock_t *firstblock,
-				    xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-	xfs_da_args_t args;
-	int retval;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
-	if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
-		return retval;
-
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = inum;
-	args.dp = dp;
-	args.firstblock = firstblock;
-	args.flist = flist;
-	args.total = total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = args.addname = args.oknoent = 0;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		retval = xfs_dir_shortform_replace(&args);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_replace(&args);
-	} else {
-		retval = xfs_dir_node_replace(&args);
-	}
-
-	return(retval);
-}
-
-static int
-xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp)
-{
-	xfs_ino_t		ino;
-	int			namelen_sum;
-	int			count;
-	xfs_dir_shortform_t	*sf;
-	xfs_dir_sf_entry_t	*sfe;
-	int			i;
-
-
-
-	if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & S_IFMT) != S_IFDIR) {
-		return 0;
-	}
-	if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) {
-		return 0;
-	}
-	if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) {
-		xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p",
-			dp);
-		return 1;
-	}
-	sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf);
-	ino = XFS_GET_DIR_INO8(sf->hdr.parent);
-	if (xfs_dir_ino_validate(mp, ino))
-		return 1;
-
-	count =	sf->hdr.count;
-	if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) {
-		xfs_fs_cmn_err(CE_WARN, mp,
-			"Invalid shortform count: dp 0x%p", dp);
-		return(1);
-	}
-
-	if (count == 0) {
-		return 0;
-	}
-
-	namelen_sum = 0;
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count - 1; i >= 0; i--) {
-		ino = XFS_GET_DIR_INO8(sfe->inumber);
-		xfs_dir_ino_validate(mp, ino);
-		if (sfe->namelen >= XFS_LITINO(mp)) {
-			xfs_fs_cmn_err(CE_WARN, mp,
-				"Invalid shortform namelen: dp 0x%p", dp);
-			return 1;
-		}
-		namelen_sum += sfe->namelen;
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	if (namelen_sum >= XFS_LITINO(mp)) {
-		xfs_fs_cmn_err(CE_WARN, mp,
-			"Invalid shortform namelen: dp 0x%p", dp);
-		return 1;
-	}
-
-	return 0;
-}
-
-/*========================================================================
- * External routines when dirsize == XFS_LBSIZE(dp->i_mount).
- *========================================================================*/
-
-/*
- * Add a name to the leaf directory structure
- * This is the external routine.
- */
-int
-xfs_dir_leaf_addname(xfs_da_args_t *args)
-{
-	int index, retval;
-	xfs_dabuf_t *bp;
-
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					      XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-
-	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-	if (retval == ENOENT)
-		retval = xfs_dir_leaf_add(bp, args, index);
-	xfs_da_buf_done(bp);
-	return(retval);
-}
-
-/*
- * Remove a name from the leaf directory structure
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
-{
-	xfs_dir_leafblock_t *leaf;
-	int index, retval;
-	xfs_dabuf_t *bp;
-
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					      XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-	if (retval == EEXIST) {
-		(void)xfs_dir_leaf_remove(args->trans, bp, index);
-		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-		*totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-		retval = 0;
-	}
-	xfs_da_buf_done(bp);
-	return(retval);
-}
-
-/*
- * Look up a name in a leaf directory structure.
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_lookup(xfs_da_args_t *args)
-{
-	int index, retval;
-	xfs_dabuf_t *bp;
-
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					      XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-	xfs_da_brelse(args->trans, bp);
-	return(retval);
-}
-
-/*
- * Copy out directory entries for getdents(), for leaf directories.
- */
-STATIC int
-xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
-				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-	xfs_dabuf_t *bp;
-	int retval, eob;
-
-	retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-	retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1);
-	xfs_da_brelse(trans, bp);
-	*eofp = (eob == 0);
-	return(retval);
-}
-
-/*
- * Look up a name in a leaf directory structure, replace the inode number.
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_replace(xfs_da_args_t *args)
-{
-	int index, retval;
-	xfs_dabuf_t *bp;
-	xfs_ino_t inum;
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-
-	inum = args->inumber;
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					      XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-	if (retval == EEXIST) {
-		leaf = bp->data;
-		entry = &leaf->entries[index];
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-		/* XXX - replace assert? */
-		XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
-		xfs_da_log_buf(args->trans, bp,
-		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
-		xfs_da_buf_done(bp);
-		retval = 0;
-	} else
-		xfs_da_brelse(args->trans, bp);
-	return(retval);
-}
-
-
-/*========================================================================
- * External routines when dirsize > XFS_LBSIZE(mp).
- *========================================================================*/
-
-/*
- * Add a name to a Btree-format directory.
- *
- * This will involve walking down the Btree, and may involve splitting
- * leaf nodes and even splitting intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_dir_node_addname(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	int retval, error;
-
-	/*
-	 * Fill in bucket of arguments/results/context to carry around.
-	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	state->blocksize = state->mp->m_sb.sb_blocksize;
-	state->node_ents = state->mp->m_dir_node_ents;
-
-	/*
-	 * Search to see if name already exists, and get back a pointer
-	 * to where it should go.
-	 */
-	error = xfs_da_node_lookup_int(state, &retval);
-	if (error)
-		retval = error;
-	if (retval != ENOENT)
-		goto error;
-	blk = &state->path.blk[ state->path.active-1 ];
-	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-	retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
-	if (retval == 0) {
-		/*
-		 * Addition succeeded, update Btree hashvals.
-		 */
-		if (!args->justcheck)
-			xfs_da_fixhashpath(state, &state->path);
-	} else {
-		/*
-		 * Addition failed, split as many Btree elements as required.
-		 */
-		if (args->total == 0) {
-			ASSERT(retval == ENOSPC);
-			goto error;
-		}
-		retval = xfs_da_split(state);
-	}
-error:
-	xfs_da_state_free(state);
-
-	return(retval);
-}
-
-/*
- * Remove a name from a B-tree directory.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_dir_node_removename(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	int retval, error;
-
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	state->blocksize = state->mp->m_sb.sb_blocksize;
-	state->node_ents = state->mp->m_dir_node_ents;
-
-	/*
-	 * Search to see if name exists, and get back a pointer to it.
-	 */
-	error = xfs_da_node_lookup_int(state, &retval);
-	if (error)
-		retval = error;
-	if (retval != EEXIST) {
-		xfs_da_state_free(state);
-		return(retval);
-	}
-
-	/*
-	 * Remove the name and update the hashvals in the tree.
-	 */
-	blk = &state->path.blk[ state->path.active-1 ];
-	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-	retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
-	xfs_da_fixhashpath(state, &state->path);
-
-	/*
-	 * Check to see if the tree needs to be collapsed.
-	 */
-	error = 0;
-	if (retval) {
-		error = xfs_da_join(state);
-	}
-
-	xfs_da_state_free(state);
-	if (error)
-		return(error);
-	return(0);
-}
-
-/*
- * Look up a filename in a int directory.
- * Use an internal routine to actually do all the work.
- */
-STATIC int
-xfs_dir_node_lookup(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	int retval, error, i;
-
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	state->blocksize = state->mp->m_sb.sb_blocksize;
-	state->node_ents = state->mp->m_dir_node_ents;
-
-	/*
-	 * Search to see if name exists,
-	 * and get back a pointer to it.
-	 */
-	error = xfs_da_node_lookup_int(state, &retval);
-	if (error) {
-		retval = error;
-	}
-
-	/*
-	 * If not in a transaction, we have to release all the buffers.
-	 */
-	for (i = 0; i < state->path.active; i++) {
-		xfs_da_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-
-	xfs_da_state_free(state);
-	return(retval);
-}
-
-STATIC int
-xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
-				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-	xfs_da_intnode_t *node;
-	xfs_da_node_entry_t *btree;
-	xfs_dir_leafblock_t *leaf = NULL;
-	xfs_dablk_t bno, nextbno;
-	xfs_dahash_t cookhash;
-	xfs_mount_t *mp;
-	int error, eob, i;
-	xfs_dabuf_t *bp;
-	xfs_daddr_t nextda;
-
-	/*
-	 * Pick up our context.
-	 */
-	mp = dp->i_mount;
-	bp = NULL;
-	bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
-	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-
-	xfs_dir_trace_g_du("node: start", dp, uio);
-
-	/*
-	 * Re-find our place, even if we're confused about what our place is.
-	 *
-	 * First we check the block number from the magic cookie, it is a
-	 * cache of where we ended last time.  If we find a leaf block, and
-	 * the starting hashval in that block is less than our desired
-	 * hashval, then we run with it.
-	 */
-	if (bno > 0) {
-		error = xfs_da_read_buf(trans, dp, bno, -2, &bp, XFS_DATA_FORK);
-		if ((error != 0) && (error != EFSCORRUPTED))
-			return(error);
-		if (bp)
-			leaf = bp->data;
-		if (bp && be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC) {
-			xfs_dir_trace_g_dub("node: block not a leaf",
-						   dp, uio, bno);
-			xfs_da_brelse(trans, bp);
-			bp = NULL;
-		}
-		if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
-			xfs_dir_trace_g_dub("node: leaf hash too large",
-						   dp, uio, bno);
-			xfs_da_brelse(trans, bp);
-			bp = NULL;
-		}
-		if (bp &&
-		    cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
-			xfs_dir_trace_g_dub("node: leaf hash too small",
-						   dp, uio, bno);
-			xfs_da_brelse(trans, bp);
-			bp = NULL;
-		}
-	}
-
-	/*
-	 * If we did not find a leaf block from the blockno in the cookie,
-	 * or we there was no blockno in the cookie (eg: first time thru),
-	 * the we start at the top of the Btree and re-find our hashval.
-	 */
-	if (bp == NULL) {
-		xfs_dir_trace_g_du("node: start at root" , dp, uio);
-		bno = 0;
-		for (;;) {
-			error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
-						       XFS_DATA_FORK);
-			if (error)
-				return(error);
-			if (bp == NULL)
-				return(XFS_ERROR(EFSCORRUPTED));
-			node = bp->data;
-			if (be16_to_cpu(node->hdr.info.magic) != XFS_DA_NODE_MAGIC)
-				break;
-			btree = &node->btree[0];
-			xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
-			for (i = 0; i < be16_to_cpu(node->hdr.count); btree++, i++) {
-				if (be32_to_cpu(btree->hashval) >= cookhash) {
-					bno = be32_to_cpu(btree->before);
-					break;
-				}
-			}
-			if (i == be16_to_cpu(node->hdr.count)) {
-				xfs_da_brelse(trans, bp);
-				xfs_dir_trace_g_du("node: hash beyond EOF",
-							  dp, uio);
-				uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
-							     XFS_DA_MAXHASH);
-				*eofp = 1;
-				return(0);
-			}
-			xfs_dir_trace_g_dub("node: going to block",
-						   dp, uio, bno);
-			xfs_da_brelse(trans, bp);
-		}
-	}
-	ASSERT(cookhash != XFS_DA_MAXHASH);
-
-	/*
-	 * We've dropped down to the (first) leaf block that contains the
-	 * hashval we are interested in.  Continue rolling upward thru the
-	 * leaf blocks until we fill up our buffer.
-	 */
-	for (;;) {
-		leaf = bp->data;
-		if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC)) {
-			xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
-			xfs_da_brelse(trans, bp);
-			XFS_CORRUPTION_ERROR("xfs_dir_node_getdents(1)",
-					     XFS_ERRLEVEL_LOW, mp, leaf);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-		xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
-		if ((nextbno = be32_to_cpu(leaf->hdr.info.forw))) {
-			nextda = xfs_da_reada_buf(trans, dp, nextbno,
-						  XFS_DATA_FORK);
-		} else
-			nextda = -1;
-		error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
-						  put, nextda);
-		xfs_da_brelse(trans, bp);
-		bno = nextbno;
-		if (eob) {
-			xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
-			*eofp = 0;
-			return(error);
-		}
-		if (bno == 0)
-			break;
-		error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
-					XFS_DATA_FORK);
-		if (error)
-			return(error);
-		if (unlikely(bp == NULL)) {
-			XFS_ERROR_REPORT("xfs_dir_node_getdents(2)",
-					 XFS_ERRLEVEL_LOW, mp);
-			return(XFS_ERROR(EFSCORRUPTED));
-		}
-	}
-	*eofp = 1;
-	xfs_dir_trace_g_du("node: E-O-F", dp, uio);
-	return(0);
-}
-
-/*
- * Look up a filename in an int directory, replace the inode number.
- * Use an internal routine to actually do the lookup.
- */
-STATIC int
-xfs_dir_node_replace(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	xfs_ino_t inum;
-	int retval, error, i;
-	xfs_dabuf_t *bp;
-
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	state->blocksize = state->mp->m_sb.sb_blocksize;
-	state->node_ents = state->mp->m_dir_node_ents;
-	inum = args->inumber;
-
-	/*
-	 * Search to see if name exists,
-	 * and get back a pointer to it.
-	 */
-	error = xfs_da_node_lookup_int(state, &retval);
-	if (error) {
-		retval = error;
-	}
-
-	if (retval == EEXIST) {
-		blk = &state->path.blk[state->path.active - 1];
-		ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-		bp = blk->bp;
-		leaf = bp->data;
-		entry = &leaf->entries[blk->index];
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-		/* XXX - replace assert ? */
-		XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
-		xfs_da_log_buf(args->trans, bp,
-		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
-		xfs_da_buf_done(bp);
-		blk->bp = NULL;
-		retval = 0;
-	} else {
-		i = state->path.active - 1;
-		xfs_da_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-	for (i = 0; i < state->path.active - 1; i++) {
-		xfs_da_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-
-	xfs_da_state_free(state);
-	return(retval);
-}
-
-#if defined(XFS_DIR_TRACE)
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
-{
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
-{
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)bno,
-		     NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio,
-			xfs_da_intnode_t *node)
-{
-	int	last = be16_to_cpu(node->hdr.count) - 1;
-
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)be32_to_cpu(node->hdr.info.forw),
-		     (void *)(unsigned long)
-			be16_to_cpu(node->hdr.count),
-		     (void *)(unsigned long)
-			be32_to_cpu(node->btree[0].hashval),
-		     (void *)(unsigned long)
-			be32_to_cpu(node->btree[last].hashval),
-		     NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
-			xfs_dir_leafblock_t *leaf)
-{
-	int	last = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
-
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)be32_to_cpu(leaf->hdr.info.forw),
-		     (void *)(unsigned long)
-			INT_GET(leaf->hdr.count, ARCH_CONVERT),
-		     (void *)(unsigned long)
-			INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
-		     (void *)(unsigned long)
-			INT_GET(leaf->entries[last].hashval, ARCH_CONVERT),
-		     NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
-			xfs_dir_leaf_entry_t *entry)
-{
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)
-			INT_GET(entry->hashval, ARCH_CONVERT),
-		     NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie)
-{
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)((unsigned long)(cookie >> 32)),
-		     (void *)((unsigned long)(cookie & 0xFFFFFFFF)),
-		     NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-void
-xfs_dir_trace_enter(int type, char *where,
-			void * a0, void * a1,
-			void * a2, void * a3,
-			void * a4, void * a5,
-			void * a6, void * a7,
-			void * a8, void * a9,
-			void * a10, void * a11)
-{
-	ASSERT(xfs_dir_trace_buf);
-	ktrace_enter(xfs_dir_trace_buf, (void *)(unsigned long)type,
-					(void *)where,
-					(void *)a0, (void *)a1, (void *)a2,
-					(void *)a3, (void *)a4, (void *)a5,
-					(void *)a6, (void *)a7, (void *)a8,
-					(void *)a9, (void *)a10, (void *)a11,
-					NULL, NULL);
-}
-#endif	/* XFS_DIR_TRACE */
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
deleted file mode 100644
index 8cc8afb9f6c0..000000000000
--- a/fs/xfs/xfs_dir.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_H__
-#define	__XFS_DIR_H__
-
-/*
- * Large directories are structured around Btrees where all the data
- * elements are in the leaf nodes.  Filenames are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of a filename may not be unique, we may have duplicate keys.  The
- * internal links in the Btree are logical block offsets into the file.
- *
- * Small directories use a different format and are packed as tightly
- * as possible so as to fit into the literal area of the inode.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-struct uio;
-struct xfs_bmap_free;
-struct xfs_da_args;
-struct xfs_dinode;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * Directory function types.
- * Put in structures (xfs_dirops_t) for v1 and v2 directories.
- */
-typedef void	(*xfs_dir_mount_t)(struct xfs_mount *mp);
-typedef int	(*xfs_dir_isempty_t)(struct xfs_inode *dp);
-typedef int	(*xfs_dir_init_t)(struct xfs_trans *tp,
-				  struct xfs_inode *dp,
-				  struct xfs_inode *pdp);
-typedef int	(*xfs_dir_createname_t)(struct xfs_trans *tp,
-					struct xfs_inode *dp,
-					char *name,
-					int namelen,
-					xfs_ino_t inum,
-					xfs_fsblock_t *first,
-					struct xfs_bmap_free *flist,
-					xfs_extlen_t total);
-typedef int	(*xfs_dir_lookup_t)(struct xfs_trans *tp,
-				    struct xfs_inode *dp,
-				    char *name,
-				    int namelen,
-				    xfs_ino_t *inum);
-typedef int	(*xfs_dir_removename_t)(struct xfs_trans *tp,
-					struct xfs_inode *dp,
-					char *name,
-					int namelen,
-					xfs_ino_t ino,
-					xfs_fsblock_t *first,
-					struct xfs_bmap_free *flist,
-					xfs_extlen_t total);
-typedef int	(*xfs_dir_getdents_t)(struct xfs_trans *tp,
-				      struct xfs_inode *dp,
-				      struct uio *uio,
-				      int *eofp);
-typedef int	(*xfs_dir_replace_t)(struct xfs_trans *tp,
-				     struct xfs_inode *dp,
-				     char *name,
-				     int namelen,
-				     xfs_ino_t inum,
-				     xfs_fsblock_t *first,
-				     struct xfs_bmap_free *flist,
-				     xfs_extlen_t total);
-typedef int	(*xfs_dir_canenter_t)(struct xfs_trans *tp,
-				      struct xfs_inode *dp,
-				      char *name,
-				      int namelen);
-typedef int	(*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
-						       struct xfs_dinode *dip);
-typedef int	(*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
-
-typedef struct xfs_dirops {
-	xfs_dir_mount_t				xd_mount;
-	xfs_dir_isempty_t			xd_isempty;
-	xfs_dir_init_t				xd_init;
-	xfs_dir_createname_t			xd_createname;
-	xfs_dir_lookup_t			xd_lookup;
-	xfs_dir_removename_t			xd_removename;
-	xfs_dir_getdents_t			xd_getdents;
-	xfs_dir_replace_t			xd_replace;
-	xfs_dir_canenter_t			xd_canenter;
-	xfs_dir_shortform_validate_ondisk_t	xd_shortform_validate_ondisk;
-	xfs_dir_shortform_to_single_t		xd_shortform_to_single;
-} xfs_dirops_t;
-
-/*
- * Overall external interface routines.
- */
-void	xfs_dir_startup(void);	/* called exactly once */
-
-#define	XFS_DIR_MOUNT(mp)	\
-	((mp)->m_dirops.xd_mount(mp))
-#define	XFS_DIR_ISEMPTY(mp,dp)	\
-	((mp)->m_dirops.xd_isempty(dp))
-#define	XFS_DIR_INIT(mp,tp,dp,pdp)	\
-	((mp)->m_dirops.xd_init(tp,dp,pdp))
-#define	XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
-	((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
-				      total))
-#define	XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum)	\
-	((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
-#define	XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total)	\
-	((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
-#define	XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp)	\
-	((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
-#define	XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total)	\
-	((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
-#define	XFS_DIR_CANENTER(mp,tp,dp,name,namelen)	\
-	((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
-#define	XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip)	\
-	((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
-#define	XFS_DIR_SHORTFORM_TO_SINGLE(mp,args)	\
-	((mp)->m_dirops.xd_shortform_to_single(args))
-
-#define	XFS_DIR_IS_V1(mp)	((mp)->m_dirversion == 1)
-#define	XFS_DIR_IS_V2(mp)	((mp)->m_dirversion == 2)
-extern xfs_dirops_t xfsv1_dirops;
-extern xfs_dirops_t xfsv2_dirops;
-
-#endif	/* __XFS_DIR_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 022c8398ab62..8edbe1adb95b 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -24,21 +24,18 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -46,69 +43,14 @@
 #include "xfs_dir2_trace.h"
 #include "xfs_error.h"
 
-/*
- * Declarations for interface routines.
- */
-static void	xfs_dir2_mount(xfs_mount_t *mp);
-static int	xfs_dir2_isempty(xfs_inode_t *dp);
-static int	xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
-			      xfs_inode_t *pdp);
-static int	xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
-				    char *name, int namelen, xfs_ino_t inum,
-				    xfs_fsblock_t *first,
-				    xfs_bmap_free_t *flist, xfs_extlen_t total);
-static int	xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-				int namelen, xfs_ino_t *inum);
-static int	xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
-				    char *name, int namelen, xfs_ino_t ino,
-				    xfs_fsblock_t *first,
-				    xfs_bmap_free_t *flist, xfs_extlen_t total);
-static int	xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
-				  int *eofp);
-static int	xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-				 int namelen, xfs_ino_t inum,
-				 xfs_fsblock_t *first, xfs_bmap_free_t *flist,
-				 xfs_extlen_t total);
-static int	xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-				  int namelen);
-static int	xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
-						   xfs_dinode_t *dip);
-
-/*
- * Utility routine declarations.
- */
 static int	xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
 static int	xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
 
-/*
- * Directory operations vector.
- */
-xfs_dirops_t	xfsv2_dirops = {
-	.xd_mount			= xfs_dir2_mount,
-	.xd_isempty			= xfs_dir2_isempty,
-	.xd_init			= xfs_dir2_init,
-	.xd_createname			= xfs_dir2_createname,
-	.xd_lookup			= xfs_dir2_lookup,
-	.xd_removename			= xfs_dir2_removename,
-	.xd_getdents			= xfs_dir2_getdents,
-	.xd_replace			= xfs_dir2_replace,
-	.xd_canenter			= xfs_dir2_canenter,
-	.xd_shortform_validate_ondisk	= xfs_dir2_shortform_validate_ondisk,
-	.xd_shortform_to_single		= xfs_dir2_sf_to_block,
-};
-
-/*
- * Interface routines.
- */
-
-/*
- * Initialize directory-related fields in the mount structure.
- */
-static void
-xfs_dir2_mount(
-	xfs_mount_t	*mp)		/* filesystem mount point */
+void
+xfs_dir_mount(
+	xfs_mount_t	*mp)
 {
-	mp->m_dirversion = 2;
+	ASSERT(XFS_SB_VERSION_HASDIRV2(&mp->m_sb));
 	ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
 	       XFS_MAX_BLOCKSIZE);
 	mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
@@ -128,19 +70,15 @@ xfs_dir2_mount(
 /*
  * Return 1 if directory contains only "." and "..".
  */
-static int				/* return code */
-xfs_dir2_isempty(
-	xfs_inode_t	*dp)		/* incore inode structure */
+int
+xfs_dir_isempty(
+	xfs_inode_t	*dp)
 {
-	xfs_dir2_sf_t	*sfp;		/* shortform directory structure */
+	xfs_dir2_sf_t	*sfp;
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	/*
-	 * Might happen during shutdown.
-	 */
-	if (dp->i_d.di_size == 0) {
+	if (dp->i_d.di_size == 0)	/* might happen during shutdown. */
 		return 1;
-	}
 	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
 		return 0;
 	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
@@ -148,53 +86,83 @@ xfs_dir2_isempty(
 }
 
 /*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+	xfs_mount_t	*mp,
+	xfs_ino_t	ino)
+{
+	xfs_agblock_t	agblkno;
+	xfs_agino_t	agino;
+	xfs_agnumber_t	agno;
+	int		ino_ok;
+	int		ioff;
+
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agblkno = XFS_INO_TO_AGBNO(mp, ino);
+	ioff = XFS_INO_TO_OFFSET(mp, ino);
+	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+	ino_ok =
+		agno < mp->m_sb.sb_agcount &&
+		agblkno < mp->m_sb.sb_agblocks &&
+		agblkno != 0 &&
+		ioff < (1 << mp->m_sb.sb_inopblog) &&
+		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+	if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+			XFS_RANDOM_DIR_INO_VALIDATE))) {
+		xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
+				(unsigned long long) ino);
+		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	return 0;
+}
+
+/*
  * Initialize a directory with its "." and ".." entries.
  */
-static int				/* error */
-xfs_dir2_init(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
-	xfs_inode_t	*pdp)		/* incore parent directory inode */
+int
+xfs_dir_init(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	xfs_inode_t	*pdp)
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		error;		/* error return value */
+	xfs_da_args_t	args;
+	int		error;
 
 	memset((char *)&args, 0, sizeof(args));
 	args.dp = dp;
 	args.trans = tp;
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) {
+	if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino)))
 		return error;
-	}
 	return xfs_dir2_sf_create(&args, pdp->i_ino);
 }
 
 /*
   Enter a name in a directory.
  */
-static int					/* error */
-xfs_dir2_createname(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*dp,		/* incore directory inode */
-	char			*name,		/* new entry name */
-	int			namelen,	/* new entry name length */
+int
+xfs_dir_createname(
+	xfs_trans_t		*tp,
+	xfs_inode_t		*dp,
+	char			*name,
+	int			namelen,
 	xfs_ino_t		inum,		/* new entry inode number */
 	xfs_fsblock_t		*first,		/* bmap's firstblock */
 	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
 	xfs_extlen_t		total)		/* bmap's total block count */
 {
-	xfs_da_args_t		args;		/* operation arguments */
-	int			rval;		/* return value */
+	xfs_da_args_t		args;
+	int			rval;
 	int			v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
 		return rval;
-	}
 	XFS_STATS_INC(xs_dir_create);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
+
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -207,18 +175,16 @@ xfs_dir2_createname(
 	args.trans = tp;
 	args.justcheck = 0;
 	args.addname = args.oknoent = 1;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_addname(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_addname(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_addname(&args);
 	else
 		rval = xfs_dir2_node_addname(&args);
@@ -228,24 +194,21 @@ xfs_dir2_createname(
 /*
  * Lookup a name in a directory, give back the inode number.
  */
-static int				/* error */
-xfs_dir2_lookup(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
-	char		*name,		/* lookup name */
-	int		namelen,	/* lookup name length */
+int
+xfs_dir_lookup(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	char		*name,
+	int		namelen,
 	xfs_ino_t	*inum)		/* out: inode number */
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		rval;		/* return value */
+	xfs_da_args_t	args;
+	int		rval;
 	int		v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_lookup);
 
-	/*
-	 * Fill in the arg structure for this request.
-	 */
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -258,18 +221,16 @@ xfs_dir2_lookup(
 	args.trans = tp;
 	args.justcheck = args.addname = 0;
 	args.oknoent = 1;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_lookup(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_lookup(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_lookup(&args);
 	else
 		rval = xfs_dir2_node_lookup(&args);
@@ -283,26 +244,24 @@ xfs_dir2_lookup(
 /*
  * Remove an entry from a directory.
  */
-static int				/* error */
-xfs_dir2_removename(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
-	char		*name,		/* name of entry to remove */
-	int		namelen,	/* name length of entry to remove */
-	xfs_ino_t	ino,		/* inode number of entry to remove */
+int
+xfs_dir_removename(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	char		*name,
+	int		namelen,
+	xfs_ino_t	ino,
 	xfs_fsblock_t	*first,		/* bmap's firstblock */
 	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
 	xfs_extlen_t	total)		/* bmap's total block count */
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		rval;		/* return value */
+	xfs_da_args_t	args;
+	int		rval;
 	int		v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_remove);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
+
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -314,18 +273,16 @@ xfs_dir2_removename(
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
 	args.justcheck = args.addname = args.oknoent = 0;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_removename(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_removename(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_removename(&args);
 	else
 		rval = xfs_dir2_node_removename(&args);
@@ -335,10 +292,10 @@ xfs_dir2_removename(
 /*
  * Read a directory.
  */
-static int				/* error */
-xfs_dir2_getdents(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+int
+xfs_dir_getdents(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	uio_t		*uio,		/* caller's buffer control */
 	int		*eofp)		/* out: eof reached */
 {
@@ -367,14 +324,11 @@ xfs_dir2_getdents(
 	}
 
 	*eofp = 0;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put);
 	else
 		rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put);
@@ -386,29 +340,26 @@ xfs_dir2_getdents(
 /*
  * Replace the inode number of a directory entry.
  */
-static int				/* error */
-xfs_dir2_replace(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+int
+xfs_dir_replace(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	char		*name,		/* name of entry to replace */
-	int		namelen,	/* name length of entry to replace */
+	int		namelen,
 	xfs_ino_t	inum,		/* new inode number */
 	xfs_fsblock_t	*first,		/* bmap's firstblock */
 	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
 	xfs_extlen_t	total)		/* bmap's total block count */
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		rval;		/* return value */
+	xfs_da_args_t	args;
+	int		rval;
 	int		v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 
-	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
 		return rval;
-	}
-	/*
-	 * Fill in the arg structure for this request.
-	 */
+
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -420,18 +371,16 @@ xfs_dir2_replace(
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
 	args.justcheck = args.addname = args.oknoent = 0;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_replace(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_replace(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_replace(&args);
 	else
 		rval = xfs_dir2_node_replace(&args);
@@ -441,21 +390,19 @@ xfs_dir2_replace(
 /*
  * See if this entry can be added to the directory without allocating space.
  */
-static int				/* error */
-xfs_dir2_canenter(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+int
+xfs_dir_canenter(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	char		*name,		/* name of entry to add */
-	int		namelen)	/* name length of entry to add */
+	int		namelen)
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		rval;		/* return value */
+	xfs_da_args_t	args;
+	int		rval;
 	int		v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
+
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -467,18 +414,16 @@ xfs_dir2_canenter(
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
 	args.justcheck = args.addname = args.oknoent = 1;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_addname(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_addname(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_addname(&args);
 	else
 		rval = xfs_dir2_node_addname(&args);
@@ -486,19 +431,6 @@ xfs_dir2_canenter(
 }
 
 /*
- * Dummy routine for shortform inode validation.
- * Can't really do this.
- */
-/* ARGSUSED */
-static int				/* error */
-xfs_dir2_shortform_validate_ondisk(
-	xfs_mount_t	*mp,		/* filesystem mount point */
-	xfs_dinode_t	*dip)		/* ondisk inode */
-{
-	return 0;
-}
-
-/*
  * Utility routines.
  */
 
@@ -507,24 +439,24 @@ xfs_dir2_shortform_validate_ondisk(
  * This routine is for data and free blocks, not leaf/node blocks
  * which are handled by xfs_da_grow_inode.
  */
-int					/* error */
+int
 xfs_dir2_grow_inode(
-	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_da_args_t	*args,
 	int		space,		/* v2 dir's space XFS_DIR2_xxx_SPACE */
 	xfs_dir2_db_t	*dbp)		/* out: block number added */
 {
 	xfs_fileoff_t	bno;		/* directory offset of new block */
 	int		count;		/* count of filesystem blocks */
 	xfs_inode_t	*dp;		/* incore directory inode */
-	int		error;		/* error return value */
+	int		error;
 	int		got;		/* blocks actually mapped */
-	int		i;		/* temp mapping index */
+	int		i;
 	xfs_bmbt_irec_t	map;		/* single structure for bmap */
 	int		mapi;		/* mapping index */
 	xfs_bmbt_irec_t	*mapp;		/* bmap mapping structure(s) */
-	xfs_mount_t	*mp;		/* filesystem mount point */
+	xfs_mount_t	*mp;
 	int		nmap;		/* number of bmap entries */
-	xfs_trans_t	*tp;		/* transaction pointer */
+	xfs_trans_t	*tp;
 
 	xfs_dir2_trace_args_s("grow_inode", args, space);
 	dp = args->dp;
@@ -538,9 +470,8 @@ xfs_dir2_grow_inode(
 	/*
 	 * Find the first hole for our block.
 	 */
-	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) {
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK)))
 		return error;
-	}
 	nmap = 1;
 	ASSERT(args->firstblock != NULL);
 	/*
@@ -549,13 +480,9 @@ xfs_dir2_grow_inode(
 	if ((error = xfs_bmapi(tp, dp, bno, count,
 			XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
-			args->flist))) {
+			args->flist, NULL)))
 		return error;
-	}
 	ASSERT(nmap <= 1);
-	/*
-	 * Got it in 1.
-	 */
 	if (nmap == 1) {
 		mapp = &map;
 		mapi = 1;
@@ -585,7 +512,8 @@ xfs_dir2_grow_inode(
 			if ((error = xfs_bmapi(tp, dp, b, c,
 					XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
-					&mapp[mapi], &nmap, args->flist))) {
+					&mapp[mapi], &nmap, args->flist,
+					NULL))) {
 				kmem_free(mapp, sizeof(*mapp) * count);
 				return error;
 			}
@@ -645,20 +573,19 @@ xfs_dir2_grow_inode(
 /*
  * See if the directory is a single-block form directory.
  */
-int					/* error */
+int
 xfs_dir2_isblock(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	int		*vp)		/* out: 1 is block, 0 is not block */
 {
 	xfs_fileoff_t	last;		/* last file offset */
-	xfs_mount_t	*mp;		/* filesystem mount point */
-	int		rval;		/* return value */
+	xfs_mount_t	*mp;
+	int		rval;
 
 	mp = dp->i_mount;
-	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
 		return rval;
-	}
 	rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
 	ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
 	*vp = rval;
@@ -668,20 +595,19 @@ xfs_dir2_isblock(
 /*
  * See if the directory is a single-leaf form directory.
  */
-int					/* error */
+int
 xfs_dir2_isleaf(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	int		*vp)		/* out: 1 is leaf, 0 is not leaf */
 {
 	xfs_fileoff_t	last;		/* last file offset */
-	xfs_mount_t	*mp;		/* filesystem mount point */
-	int		rval;		/* return value */
+	xfs_mount_t	*mp;
+	int		rval;
 
 	mp = dp->i_mount;
-	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
 		return rval;
-	}
 	*vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
 	return 0;
 }
@@ -689,9 +615,9 @@ xfs_dir2_isleaf(
 /*
  * Getdents put routine for 64-bit ABI, direct form.
  */
-static int					/* error */
+static int
 xfs_dir2_put_dirent64_direct(
-	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+	xfs_dir2_put_args_t	*pa)
 {
 	xfs_dirent_t		*idbp;		/* dirent pointer */
 	iovec_t			*iovp;		/* io vector */
@@ -726,9 +652,9 @@ xfs_dir2_put_dirent64_direct(
 /*
  * Getdents put routine for 64-bit ABI, uio form.
  */
-static int					/* error */
+static int
 xfs_dir2_put_dirent64_uio(
-	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+	xfs_dir2_put_args_t	*pa)
 {
 	xfs_dirent_t		*idbp;		/* dirent pointer */
 	int			namelen;	/* entry name length */
@@ -764,17 +690,17 @@ xfs_dir2_put_dirent64_uio(
  */
 int
 xfs_dir2_shrink_inode(
-	xfs_da_args_t	*args,		/* operation arguments */
-	xfs_dir2_db_t	db,		/* directory block number */
-	xfs_dabuf_t	*bp)		/* block's buffer */
+	xfs_da_args_t	*args,
+	xfs_dir2_db_t	db,
+	xfs_dabuf_t	*bp)
 {
 	xfs_fileoff_t	bno;		/* directory file offset */
 	xfs_dablk_t	da;		/* directory file offset */
 	int		done;		/* bunmap is finished */
-	xfs_inode_t	*dp;		/* incore directory inode */
-	int		error;		/* error return value */
-	xfs_mount_t	*mp;		/* filesystem mount point */
-	xfs_trans_t	*tp;		/* transaction pointer */
+	xfs_inode_t	*dp;
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_trans_t	*tp;
 
 	xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
 	dp = args->dp;
@@ -786,7 +712,7 @@ xfs_dir2_shrink_inode(
 	 */
 	if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
 			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-			&done))) {
+			NULL, &done))) {
 		/*
 		 * ENOSPC actually can happen if we're in a removename with
 		 * no space reservation, and the resulting block removal
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 7dd364b1e038..86560b6f794c 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -22,7 +22,9 @@ struct uio;
 struct xfs_dabuf;
 struct xfs_da_args;
 struct xfs_dir2_put_args;
+struct xfs_bmap_free;
 struct xfs_inode;
+struct xfs_mount;
 struct xfs_trans;
 
 /*
@@ -73,7 +75,35 @@ typedef struct xfs_dir2_put_args {
 } xfs_dir2_put_args_t;
 
 /*
- * Other interfaces used by the rest of the dir v2 code.
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern void xfs_dir_mount(struct xfs_mount *mp);
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen, xfs_ino_t inum,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen, xfs_ino_t *inum);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen, xfs_ino_t ino,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
+				uio_t *uio, int *eofp);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen, xfs_ino_t inum,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen);
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
+/*
+ * Utility routines for v2 directories.
  */
 extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
 				xfs_dir2_db_t *dbp);
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 972ded595476..9d7438bba30d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -22,19 +22,16 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -51,6 +48,18 @@ static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
 				     int *entno);
 static int xfs_dir2_block_sort(const void *a, const void *b);
 
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
+}
+
 /*
  * Add an entry to a block directory.
  */
@@ -400,7 +409,7 @@ xfs_dir2_block_addname(
 	/*
 	 * Create the new data entry.
 	 */
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, args->namelen);
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -508,7 +517,7 @@ xfs_dir2_block_getdents(
 
 		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
 						    ptr - (char *)block);
-		p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
+		p.ino = be64_to_cpu(dep->inumber);
 #if XFS_BIG_INUMS
 		p.ino += mp->m_inoadd;
 #endif
@@ -626,7 +635,7 @@ xfs_dir2_block_lookup(
 	/*
 	 * Fill in inode number, release the block.
 	 */
-	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	args->inumber = be64_to_cpu(dep->inumber);
 	xfs_da_brelse(args->trans, bp);
 	return XFS_ERROR(EEXIST);
 }
@@ -844,11 +853,11 @@ xfs_dir2_block_replace(
 	 */
 	dep = (xfs_dir2_data_entry_t *)
 	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address)));
-	ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
+	ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
 	/*
 	 * Change the inode number to the new value.
 	 */
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	xfs_dir2_data_log_entry(args->trans, bp, dep);
 	xfs_dir2_data_check(dp, bp);
 	xfs_da_buf_done(bp);
@@ -1130,7 +1139,7 @@ xfs_dir2_sf_to_block(
 	 */
 	dep = (xfs_dir2_data_entry_t *)
 	      ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
-	INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino);
+	dep->inumber = cpu_to_be64(dp->i_ino);
 	dep->namelen = 1;
 	dep->name[0] = '.';
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1144,7 +1153,7 @@ xfs_dir2_sf_to_block(
 	 */
 	dep = (xfs_dir2_data_entry_t *)
 		((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
-	INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
+	dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
 	dep->namelen = 2;
 	dep->name[0] = dep->name[1] = '.';
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1193,7 +1202,7 @@ xfs_dir2_sf_to_block(
 		 * Copy a real entry.
 		 */
 		dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
-		INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp,
+		dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp,
 				XFS_DIR2_SF_INUMBERP(sfep)));
 		dep->namelen = sfep->namelen;
 		memcpy(dep->name, sfep->name, dep->namelen);
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index bb3d03ff002b..f7c799217072 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -22,18 +22,15 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -133,7 +130,7 @@ xfs_dir2_data_check(
 		 */
 		dep = (xfs_dir2_data_entry_t *)p;
 		ASSERT(dep->namelen != 0);
-		ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0);
+		ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
 		ASSERT(be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep)) ==
 		       (char *)dep - (char *)d);
 		count++;
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index 0847cbb53e17..a6ae2d21c40a 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -85,11 +85,11 @@ typedef struct xfs_dir2_data_hdr {
  * Tag appears as the last 2 bytes.
  */
 typedef struct xfs_dir2_data_entry {
-	xfs_ino_t		inumber;	/* inode number */
-	__uint8_t		namelen;	/* name length */
-	__uint8_t		name[1];	/* name bytes, no null */
+	__be64			inumber;	/* inode number */
+	__u8			namelen;	/* name length */
+	__u8			name[1];	/* name bytes, no null */
 						/* variable offset */
-	xfs_dir2_data_off_t	tag;		/* starting offset of us */
+	__be16			tag;		/* starting offset of us */
 } xfs_dir2_data_entry_t;
 
 /*
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0f5e2f2ce6ec..b1cf1fbf423d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -407,7 +405,7 @@ xfs_dir2_leaf_addname(
 	 * Initialize our new entry (at last).
 	 */
 	dep = (xfs_dir2_data_entry_t *)dup;
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, dep->namelen);
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -884,7 +882,7 @@ xfs_dir2_leaf_getdents(
 					XFS_DIR2_BYTE_TO_DA(mp,
 						XFS_DIR2_LEAF_OFFSET) - map_off,
 					XFS_BMAPI_METADATA, NULL, 0,
-					&map[map_valid], &nmap, NULL);
+					&map[map_valid], &nmap, NULL, NULL);
 				/*
 				 * Don't know if we should ignore this or
 				 * try to return an error.
@@ -1098,7 +1096,7 @@ xfs_dir2_leaf_getdents(
 
 		p->cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length);
 
-		p->ino = INT_GET(dep->inumber, ARCH_CONVERT);
+		p->ino = be64_to_cpu(dep->inumber);
 #if XFS_BIG_INUMS
 		p->ino += mp->m_inoadd;
 #endif
@@ -1319,7 +1317,7 @@ xfs_dir2_leaf_lookup(
 	/*
 	 * Return the found inode number.
 	 */
-	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	args->inumber = be64_to_cpu(dep->inumber);
 	xfs_da_brelse(tp, dbp);
 	xfs_da_brelse(tp, lbp);
 	return XFS_ERROR(EEXIST);
@@ -1606,11 +1604,11 @@ xfs_dir2_leaf_replace(
 	dep = (xfs_dir2_data_entry_t *)
 	      ((char *)dbp->data +
 	       XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address)));
-	ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT));
+	ASSERT(args->inumber != be64_to_cpu(dep->inumber));
 	/*
 	 * Put the new inode number in, log it.
 	 */
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	tp = args->trans;
 	xfs_dir2_data_log_entry(tp, dbp, dep);
 	xfs_da_buf_done(dbp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index ac511ab9c52d..9ca71719b683 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -22,13 +22,11 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -505,7 +503,6 @@ xfs_dir2_leafn_lookup_int(
 							XFS_DATA_FORK))) {
 						return error;
 					}
-					curfdb = newfdb;
 					free = curbp->data;
 					ASSERT(be32_to_cpu(free->hdr.magic) ==
 					       XFS_DIR2_FREE_MAGIC);
@@ -527,8 +524,11 @@ xfs_dir2_leafn_lookup_int(
 				if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) {
 					XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
 							 XFS_ERRLEVEL_LOW, mp);
+					if (curfdb != newfdb)
+						xfs_da_brelse(tp, curbp);
 					return XFS_ERROR(EFSCORRUPTED);
 				}
+				curfdb = newfdb;
 				if (be16_to_cpu(free->bests[fi]) >= length) {
 					*indexp = index;
 					state->extravalid = 1;
@@ -580,7 +580,7 @@ xfs_dir2_leafn_lookup_int(
 			if (dep->namelen == args->namelen &&
 			    dep->name[0] == args->name[0] &&
 			    memcmp(dep->name, args->name, args->namelen) == 0) {
-				args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+				args->inumber = be64_to_cpu(dep->inumber);
 				*indexp = index;
 				state->extravalid = 1;
 				state->extrablk.bp = curbp;
@@ -970,7 +970,7 @@ xfs_dir2_leafn_remove(
 			/*
 			 * One less used entry in the free table.
 			 */
-			free->hdr.nused = cpu_to_be32(-1);
+			be32_add(&free->hdr.nused, -1);
 			xfs_dir2_free_log_header(tp, fbp);
 			/*
 			 * If this was the last entry in the table, we can
@@ -1695,7 +1695,7 @@ xfs_dir2_node_addname_int(
 	 * Fill in the new entry and log it.
 	 */
 	dep = (xfs_dir2_data_entry_t *)dup;
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, dep->namelen);
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1905,11 +1905,11 @@ xfs_dir2_node_replace(
 		dep = (xfs_dir2_data_entry_t *)
 		      ((char *)data +
 		       XFS_DIR2_DATAPTR_TO_OFF(state->mp, be32_to_cpu(lep->address)));
-		ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT));
+		ASSERT(inum != be64_to_cpu(dep->inumber));
 		/*
 		 * Fill in the new inode number and log the entry.
 		 */
-		INT_SET(dep->inumber, ARCH_CONVERT, inum);
+		dep->inumber = cpu_to_be64(inum);
 		xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
 		rval = 0;
 	}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index d98a41d1fe63..0cd77b17bf92 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -22,19 +22,16 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_error.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
@@ -117,13 +114,13 @@ xfs_dir2_block_sfsize(
 			dep->name[0] == '.' && dep->name[1] == '.';
 #if XFS_BIG_INUMS
 		if (!isdot)
-			i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM;
+			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
 #endif
 		if (!isdot && !isdotdot) {
 			count++;
 			namelen += dep->namelen;
 		} else if (isdotdot)
-			parent = INT_GET(dep->inumber, ARCH_CONVERT);
+			parent = be64_to_cpu(dep->inumber);
 		/*
 		 * Calculate the new size, see if we should give up yet.
 		 */
@@ -229,13 +226,13 @@ xfs_dir2_block_to_sf(
 		 * Skip .
 		 */
 		if (dep->namelen == 1 && dep->name[0] == '.')
-			ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino);
+			ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
 		/*
 		 * Skip .., but make sure the inode number is right.
 		 */
 		else if (dep->namelen == 2 &&
 			 dep->name[0] == '.' && dep->name[1] == '.')
-			ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) ==
+			ASSERT(be64_to_cpu(dep->inumber) ==
 			       XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
 		/*
 		 * Normal entry, copy it into shortform.
@@ -246,7 +243,7 @@ xfs_dir2_block_to_sf(
 				(xfs_dir2_data_aoff_t)
 				((char *)dep - (char *)block));
 			memcpy(sfep->name, dep->name, dep->namelen);
-			temp=INT_GET(dep->inumber, ARCH_CONVERT);
+			temp = be64_to_cpu(dep->inumber);
 			XFS_DIR2_SF_PUT_INUMBER(sfp, &temp,
 				XFS_DIR2_SF_INUMBERP(sfep));
 			sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
index c626943b4112..f3fb2ffd6f5c 100644
--- a/fs/xfs/xfs_dir2_trace.c
+++ b/fs/xfs/xfs_dir2_trace.c
@@ -19,11 +19,9 @@
 #include "xfs_fs.h"
 #include "xfs_types.h"
 #include "xfs_inum.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
deleted file mode 100644
index 6d711869262f..000000000000
--- a/fs/xfs/xfs_dir_leaf.c
+++ /dev/null
@@ -1,2213 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_error.h"
-
-/*
- * xfs_dir_leaf.c
- *
- * Routines to implement leaf blocks of directories as Btrees of hashed names.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Routines used for growing the Btree.
- */
-STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
-					      int insertion_index,
-					      int freemap_index);
-STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer,
-					    int musthave, int justcheck);
-STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state,
-						  xfs_da_state_blk_t *blk1,
-						  xfs_da_state_blk_t *blk2);
-STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
-					  xfs_da_state_blk_t *leaf_blk_1,
-					  xfs_da_state_blk_t *leaf_blk_2,
-					  int *number_entries_in_blk1,
-					  int *number_namebytes_in_blk1);
-
-STATIC int xfs_dir_leaf_create(struct xfs_da_args *args,
-				xfs_dablk_t which_block,
-				struct xfs_dabuf **bpp);
-
-/*
- * Utility routines.
- */
-STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf,
-					      int src_start,
-					      xfs_dir_leafblock_t *dst_leaf,
-					      int dst_start, int move_count,
-					      xfs_mount_t *mp);
-
-
-/*========================================================================
- * External routines when dirsize < XFS_IFORK_DSIZE(dp).
- *========================================================================*/
-
-
-/*
- * Validate a given inode number.
- */
-int
-xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino)
-{
-	xfs_agblock_t	agblkno;
-	xfs_agino_t	agino;
-	xfs_agnumber_t	agno;
-	int		ino_ok;
-	int		ioff;
-
-	agno = XFS_INO_TO_AGNO(mp, ino);
-	agblkno = XFS_INO_TO_AGBNO(mp, ino);
-	ioff = XFS_INO_TO_OFFSET(mp, ino);
-	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
-	ino_ok =
-		agno < mp->m_sb.sb_agcount &&
-		agblkno < mp->m_sb.sb_agblocks &&
-		agblkno != 0 &&
-		ioff < (1 << mp->m_sb.sb_inopblog) &&
-		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
-	if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
-			XFS_RANDOM_DIR_INO_VALIDATE))) {
-		xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
-				(unsigned long long) ino);
-		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-	return 0;
-}
-
-/*
- * Create the initial contents of a shortform directory.
- */
-int
-xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
-{
-	xfs_dir_sf_hdr_t *hdr;
-	xfs_inode_t *dp;
-
-	dp = args->dp;
-	ASSERT(dp != NULL);
-	ASSERT(dp->i_d.di_size == 0);
-	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
-		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
-		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
-		dp->i_df.if_flags |= XFS_IFINLINE;
-	}
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	ASSERT(dp->i_df.if_bytes == 0);
-	xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK);
-	hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	XFS_DIR_SF_PUT_DIRINO(&parent, &hdr->parent);
-
-	hdr->count = 0;
-	dp->i_d.di_size = sizeof(*hdr);
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-	return 0;
-}
-
-/*
- * Add a name to the shortform directory structure.
- * Overflow from the inode has already been checked for.
- */
-int
-xfs_dir_shortform_addname(xfs_da_args_t *args)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	int i, offset, size;
-	xfs_inode_t *dp;
-
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count-1; i >= 0; i--) {
-		if (sfe->namelen == args->namelen &&
-		    args->name[0] == sfe->name[0] &&
-		    memcmp(args->name, sfe->name, args->namelen) == 0)
-			return XFS_ERROR(EEXIST);
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-
-	offset = (int)((char *)sfe - (char *)sf);
-	size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen);
-	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset);
-
-	XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
-	sfe->namelen = args->namelen;
-	memcpy(sfe->name, args->name, sfe->namelen);
-	sf->hdr.count++;
-
-	dp->i_d.di_size += size;
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-
-	return 0;
-}
-
-/*
- * Remove a name from the shortform directory structure.
- */
-int
-xfs_dir_shortform_removename(xfs_da_args_t *args)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	int base, size = 0, i;
-	xfs_inode_t *dp;
-
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	base = sizeof(xfs_dir_sf_hdr_t);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count-1; i >= 0; i--) {
-		size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe);
-		if (sfe->namelen == args->namelen &&
-		    sfe->name[0] == args->name[0] &&
-		    memcmp(sfe->name, args->name, args->namelen) == 0)
-			break;
-		base += size;
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	if (i < 0) {
-		ASSERT(args->oknoent);
-		return XFS_ERROR(ENOENT);
-	}
-
-	if ((base + size) != dp->i_d.di_size) {
-		memmove(&((char *)sf)[base], &((char *)sf)[base+size],
-					      dp->i_d.di_size - (base+size));
-	}
-	sf->hdr.count--;
-
-	xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
-	dp->i_d.di_size -= size;
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-
-	return 0;
-}
-
-/*
- * Look up a name in a shortform directory structure.
- */
-int
-xfs_dir_shortform_lookup(xfs_da_args_t *args)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	int i;
-	xfs_inode_t *dp;
-
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	if (args->namelen == 2 &&
-	    args->name[0] == '.' && args->name[1] == '.') {
-		XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &args->inumber);
-		return(XFS_ERROR(EEXIST));
-	}
-	if (args->namelen == 1 && args->name[0] == '.') {
-		args->inumber = dp->i_ino;
-		return(XFS_ERROR(EEXIST));
-	}
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count-1; i >= 0; i--) {
-		if (sfe->namelen == args->namelen &&
-		    sfe->name[0] == args->name[0] &&
-		    memcmp(args->name, sfe->name, args->namelen) == 0) {
-			XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args->inumber);
-			return(XFS_ERROR(EEXIST));
-		}
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	ASSERT(args->oknoent);
-	return(XFS_ERROR(ENOENT));
-}
-
-/*
- * Convert from using the shortform to the leaf.
- */
-int
-xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
-{
-	xfs_inode_t *dp;
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	xfs_da_args_t args;
-	xfs_ino_t inumber;
-	char *tmpbuffer;
-	int retval, i, size;
-	xfs_dablk_t blkno;
-	xfs_dabuf_t *bp;
-
-	dp = iargs->dp;
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	size = dp->i_df.if_bytes;
-	tmpbuffer = kmem_alloc(size, KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
-
-	memcpy(tmpbuffer, dp->i_df.if_u1.if_data, size);
-
-	sf = (xfs_dir_shortform_t *)tmpbuffer;
-	XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &inumber);
-
-	xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
-	dp->i_d.di_size = 0;
-	xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE);
-	retval = xfs_da_grow_inode(iargs, &blkno);
-	if (retval)
-		goto out;
-
-	ASSERT(blkno == 0);
-	retval = xfs_dir_leaf_create(iargs, blkno, &bp);
-	if (retval)
-		goto out;
-	xfs_da_buf_done(bp);
-
-	args.name = ".";
-	args.namelen = 1;
-	args.hashval = xfs_dir_hash_dot;
-	args.inumber = dp->i_ino;
-	args.dp = dp;
-	args.firstblock = iargs->firstblock;
-	args.flist = iargs->flist;
-	args.total = iargs->total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = iargs->trans;
-	args.justcheck = 0;
-	args.addname = args.oknoent = 1;
-	retval = xfs_dir_leaf_addname(&args);
-	if (retval)
-		goto out;
-
-	args.name = "..";
-	args.namelen = 2;
-	args.hashval = xfs_dir_hash_dotdot;
-	args.inumber = inumber;
-	retval = xfs_dir_leaf_addname(&args);
-	if (retval)
-		goto out;
-
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count; i++) {
-		args.name = (char *)(sfe->name);
-		args.namelen = sfe->namelen;
-		args.hashval = xfs_da_hashname((char *)(sfe->name),
-					       sfe->namelen);
-		XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args.inumber);
-		retval = xfs_dir_leaf_addname(&args);
-		if (retval)
-			goto out;
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	retval = 0;
-
-out:
-	kmem_free(tmpbuffer, size);
-	return retval;
-}
-
-STATIC int
-xfs_dir_shortform_compare(const void *a, const void *b)
-{
-	xfs_dir_sf_sort_t *sa, *sb;
-
-	sa = (xfs_dir_sf_sort_t *)a;
-	sb = (xfs_dir_sf_sort_t *)b;
-	if (sa->hash < sb->hash)
-		return -1;
-	else if (sa->hash > sb->hash)
-		return 1;
-	else
-		return sa->entno - sb->entno;
-}
-
-/*
- * Copy out directory entries for getdents(), for shortform directories.
- */
-/*ARGSUSED*/
-int
-xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp,
-				       xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	int retval, i, sbsize, nsbuf, lastresid=0, want_entno;
-	xfs_mount_t *mp;
-	xfs_dahash_t cookhash, hash;
-	xfs_dir_put_args_t p;
-	xfs_dir_sf_sort_t *sbuf, *sbp;
-
-	mp = dp->i_mount;
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-	want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
-	nsbuf = sf->hdr.count + 2;
-	sbsize = (nsbuf + 1) * sizeof(*sbuf);
-	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
-
-	xfs_dir_trace_g_du("sf: start", dp, uio);
-
-	/*
-	 * Collect all the entries into the buffer.
-	 * Entry 0 is .
-	 */
-	sbp->entno = 0;
-	sbp->seqno = 0;
-	sbp->hash = xfs_dir_hash_dot;
-	sbp->ino = dp->i_ino;
-	sbp->name = ".";
-	sbp->namelen = 1;
-	sbp++;
-
-	/*
-	 * Entry 1 is ..
-	 */
-	sbp->entno = 1;
-	sbp->seqno = 0;
-	sbp->hash = xfs_dir_hash_dotdot;
-	sbp->ino = XFS_GET_DIR_INO8(sf->hdr.parent);
-	sbp->name = "..";
-	sbp->namelen = 2;
-	sbp++;
-
-	/*
-	 * Scan the directory data for the rest of the entries.
-	 */
-	for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-
-		if (unlikely(
-		    ((char *)sfe < (char *)sf) ||
-		    ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)))) {
-			xfs_dir_trace_g_du("sf: corrupted", dp, uio);
-			XFS_CORRUPTION_ERROR("xfs_dir_shortform_getdents",
-					     XFS_ERRLEVEL_LOW, mp, sfe);
-			kmem_free(sbuf, sbsize);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-
-		sbp->entno = i + 2;
-		sbp->seqno = 0;
-		sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen);
-		sbp->ino = XFS_GET_DIR_INO8(sfe->inumber);
-		sbp->name = (char *)sfe->name;
-		sbp->namelen = sfe->namelen;
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-		sbp++;
-	}
-
-	/*
-	 * Sort the entries on hash then entno.
-	 */
-	xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare);
-	/*
-	 * Stuff in last entry.
-	 */
-	sbp->entno = nsbuf;
-	sbp->hash = XFS_DA_MAXHASH;
-	sbp->seqno = 0;
-	/*
-	 * Figure out the sequence numbers in case there's a hash duplicate.
-	 */
-	for (hash = sbuf->hash, sbp = sbuf + 1;
-				sbp < &sbuf[nsbuf + 1]; sbp++) {
-		if (sbp->hash == hash)
-			sbp->seqno = sbp[-1].seqno + 1;
-		else
-			hash = sbp->hash;
-	}
-
-	/*
-	 * Set up put routine.
-	 */
-	p.dbp = dbp;
-	p.put = put;
-	p.uio = uio;
-
-	/*
-	 * Find our place.
-	 */
-	for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) {
-		if (sbp->hash > cookhash ||
-		    (sbp->hash == cookhash && sbp->seqno >= want_entno))
-			break;
-	}
-
-	/*
-	 * Did we fail to find anything?  We stop at the last entry,
-	 * the one we put maxhash into.
-	 */
-	if (sbp == &sbuf[nsbuf]) {
-		kmem_free(sbuf, sbsize);
-		xfs_dir_trace_g_du("sf: hash beyond end", dp, uio);
-		uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
-		*eofp = 1;
-		return 0;
-	}
-
-	/*
-	 * Loop putting entries into the user buffer.
-	 */
-	while (sbp < &sbuf[nsbuf]) {
-		/*
-		 * Save the first resid in a run of equal-hashval entries
-		 * so that we can back them out if they don't all fit.
-		 */
-		if (sbp->seqno == 0 || sbp == sbuf)
-			lastresid = uio->uio_resid;
-		XFS_PUT_COOKIE(p.cook, mp, 0, sbp[1].seqno, sbp[1].hash);
-		p.ino = sbp->ino;
-#if XFS_BIG_INUMS
-		p.ino += mp->m_inoadd;
-#endif
-		p.name = sbp->name;
-		p.namelen = sbp->namelen;
-		retval = p.put(&p);
-		if (!p.done) {
-			uio->uio_offset =
-				XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash);
-			kmem_free(sbuf, sbsize);
-			uio->uio_resid = lastresid;
-			xfs_dir_trace_g_du("sf: E-O-B", dp, uio);
-			return retval;
-		}
-		sbp++;
-	}
-	kmem_free(sbuf, sbsize);
-	uio->uio_offset = p.cook.o;
-	*eofp = 1;
-	xfs_dir_trace_g_du("sf: E-O-F", dp, uio);
-	return 0;
-}
-
-/*
- * Look up a name in a shortform directory structure, replace the inode number.
- */
-int
-xfs_dir_shortform_replace(xfs_da_args_t *args)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	xfs_inode_t *dp;
-	int i;
-
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	if (args->namelen == 2 &&
-	    args->name[0] == '.' && args->name[1] == '.') {
-		/* XXX - replace assert? */
-		XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent);
-		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
-		return 0;
-	}
-	ASSERT(args->namelen != 1 || args->name[0] != '.');
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count-1; i >= 0; i--) {
-		if (sfe->namelen == args->namelen &&
-		    sfe->name[0] == args->name[0] &&
-		    memcmp(args->name, sfe->name, args->namelen) == 0) {
-			ASSERT(memcmp((char *)&args->inumber,
-				(char *)&sfe->inumber, sizeof(xfs_ino_t)));
-			XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
-			xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
-			return 0;
-		}
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	ASSERT(args->oknoent);
-	return XFS_ERROR(ENOENT);
-}
-
-/*
- * Convert a leaf directory to shortform structure
- */
-int
-xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	xfs_da_args_t args;
-	xfs_inode_t *dp;
-	xfs_ino_t parent = 0;
-	char *tmpbuffer;
-	int retval, i;
-	xfs_dabuf_t *bp;
-
-	dp = iargs->dp;
-	tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
-
-	retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
-					       XFS_DATA_FORK);
-	if (retval)
-		goto out;
-	ASSERT(bp != NULL);
-	memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
-	leaf = (xfs_dir_leafblock_t *)tmpbuffer;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
-
-	/*
-	 * Find and special case the parent inode number
-	 */
-	hdr = &leaf->hdr;
-	entry = &leaf->entries[0];
-	for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-		if ((entry->namelen == 2) &&
-		    (namest->name[0] == '.') &&
-		    (namest->name[1] == '.')) {
-			XFS_DIR_SF_GET_DIRINO(&namest->inumber, &parent);
-			entry->nameidx = 0;
-		} else if ((entry->namelen == 1) && (namest->name[0] == '.')) {
-			entry->nameidx = 0;
-		}
-	}
-	retval = xfs_da_shrink_inode(iargs, 0, bp);
-	if (retval)
-		goto out;
-	retval = xfs_dir_shortform_create(iargs, parent);
-	if (retval)
-		goto out;
-
-	/*
-	 * Copy the rest of the filenames
-	 */
-	entry = &leaf->entries[0];
-	args.dp = dp;
-	args.firstblock = iargs->firstblock;
-	args.flist = iargs->flist;
-	args.total = iargs->total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = iargs->trans;
-	args.justcheck = 0;
-	args.addname = args.oknoent = 1;
-	for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) {
-		if (!entry->nameidx)
-			continue;
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-		args.name = (char *)(namest->name);
-		args.namelen = entry->namelen;
-		args.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
-		XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args.inumber);
-		xfs_dir_shortform_addname(&args);
-	}
-
-out:
-	kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
-	return retval;
-}
-
-/*
- * Convert from using a single leaf to a root node and a leaf.
- */
-int
-xfs_dir_leaf_to_node(xfs_da_args_t *args)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_da_intnode_t *node;
-	xfs_inode_t *dp;
-	xfs_dabuf_t *bp1, *bp2;
-	xfs_dablk_t blkno;
-	int retval;
-
-	dp = args->dp;
-	retval = xfs_da_grow_inode(args, &blkno);
-	ASSERT(blkno == 1);
-	if (retval)
-		return retval;
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
-					      XFS_DATA_FORK);
-	if (retval)
-		return retval;
-	ASSERT(bp1 != NULL);
-	retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
-					     XFS_DATA_FORK);
-	if (retval) {
-		xfs_da_buf_done(bp1);
-		return retval;
-	}
-	ASSERT(bp2 != NULL);
-	memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
-	xfs_da_buf_done(bp1);
-	xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
-
-	/*
-	 * Set up the new root node.
-	 */
-	retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
-	if (retval) {
-		xfs_da_buf_done(bp2);
-		return retval;
-	}
-	node = bp1->data;
-	leaf = bp2->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	node->btree[0].hashval = cpu_to_be32(
-		INT_GET(leaf->entries[
-			INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
-	xfs_da_buf_done(bp2);
-	node->btree[0].before = cpu_to_be32(blkno);
-	node->hdr.count = cpu_to_be16(1);
-	xfs_da_log_buf(args->trans, bp1,
-		XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
-	xfs_da_buf_done(bp1);
-
-	return retval;
-}
-
-
-/*========================================================================
- * Routines used for growing the Btree.
- *========================================================================*/
-
-/*
- * Create the initial contents of a leaf directory
- * or a leaf in a node directory.
- */
-STATIC int
-xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_inode_t *dp;
-	xfs_dabuf_t *bp;
-	int retval;
-
-	dp = args->dp;
-	ASSERT(dp != NULL);
-	retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
-	if (retval)
-		return retval;
-	ASSERT(bp != NULL);
-	leaf = bp->data;
-	memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
-	hdr = &leaf->hdr;
-	hdr->info.magic = cpu_to_be16(XFS_DIR_LEAF_MAGIC);
-	INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
-	if (!hdr->firstused)
-		INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1);
-	INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
-	INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT));
-
-	xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
-
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Split the leaf node, rebalance, then add the new entry.
- */
-int
-xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
-				  xfs_da_state_blk_t *newblk)
-{
-	xfs_dablk_t blkno;
-	xfs_da_args_t *args;
-	int error;
-
-	/*
-	 * Allocate space for a new leaf node.
-	 */
-	args = state->args;
-	ASSERT(args != NULL);
-	ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
-	error = xfs_da_grow_inode(args, &blkno);
-	if (error)
-		return error;
-	error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
-	if (error)
-		return error;
-	newblk->blkno = blkno;
-	newblk->magic = XFS_DIR_LEAF_MAGIC;
-
-	/*
-	 * Rebalance the entries across the two leaves.
-	 */
-	xfs_dir_leaf_rebalance(state, oldblk, newblk);
-	error = xfs_da_blk_link(state, oldblk, newblk);
-	if (error)
-		return error;
-
-	/*
-	 * Insert the new entry in the correct block.
-	 */
-	if (state->inleaf) {
-		error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index);
-	} else {
-		error = xfs_dir_leaf_add(newblk->bp, args, newblk->index);
-	}
-
-	/*
-	 * Update last hashval in each block since we added the name.
-	 */
-	oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
-	newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
-	return error;
-}
-
-/*
- * Add a name to the leaf directory structure.
- *
- * Must take into account fragmented leaves and leaves where spacemap has
- * lost some freespace information (ie: holes).
- */
-int
-xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_dir_leaf_map_t *map;
-	int tablesize, entsize, sum, i, tmp, error;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
-	hdr = &leaf->hdr;
-	entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
-
-	/*
-	 * Search through freemap for first-fit on new name length.
-	 * (may need to figure in size of entry struct too)
-	 */
-	tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t)
-			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
-	map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
-	for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
-		if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
-			sum += INT_GET(map->size, ARCH_CONVERT);
-			continue;
-		}
-		if (!map->size)
-			continue;	/* no space in this map */
-		tmp = entsize;
-		if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
-			tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
-		if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
-			if (!args->justcheck)
-				xfs_dir_leaf_add_work(bp, args, index, i);
-			return 0;
-		}
-		sum += INT_GET(map->size, ARCH_CONVERT);
-	}
-
-	/*
-	 * If there are no holes in the address space of the block,
-	 * and we don't have enough freespace, then compaction will do us
-	 * no good and we should just give up.
-	 */
-	if (!hdr->holes && (sum < entsize))
-		return XFS_ERROR(ENOSPC);
-
-	/*
-	 * Compact the entries to coalesce free space.
-	 * Pass the justcheck flag so the checking pass can return
-	 * an error, without changing anything, if it won't fit.
-	 */
-	error = xfs_dir_leaf_compact(args->trans, bp,
-			args->total == 0 ?
-				entsize +
-				(uint)sizeof(xfs_dir_leaf_entry_t) : 0,
-			args->justcheck);
-	if (error)
-		return error;
-	/*
-	 * After compaction, the block is guaranteed to have only one
-	 * free region, in freemap[0].  If it is not big enough, give up.
-	 */
-	if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
-	    (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
-		return XFS_ERROR(ENOSPC);
-
-	if (!args->justcheck)
-		xfs_dir_leaf_add_work(bp, args, index, 0);
-	return 0;
-}
-
-/*
- * Add a name to a leaf directory structure.
- */
-STATIC void
-xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
-		      int mapindex)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	xfs_dir_leaf_map_t *map;
-	/* REFERENCED */
-	xfs_mount_t *mp;
-	int tmp, i;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	hdr = &leaf->hdr;
-	ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
-	ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT)));
-
-	/*
-	 * Force open some space in the entry array and fill it in.
-	 */
-	entry = &leaf->entries[index];
-	if (index < INT_GET(hdr->count, ARCH_CONVERT)) {
-		tmp  = INT_GET(hdr->count, ARCH_CONVERT) - index;
-		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-		memmove(entry + 1, entry, tmp);
-		xfs_da_log_buf(args->trans, bp,
-		    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
-	}
-	INT_MOD(hdr->count, ARCH_CONVERT, +1);
-
-	/*
-	 * Allocate space for the new string (at the end of the run).
-	 */
-	map = &hdr->freemap[mapindex];
-	mp = args->trans->t_mountp;
-	ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
-	ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
-	ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
-	INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
-	INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT));
-	INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
-	entry->namelen = args->namelen;
-	xfs_da_log_buf(args->trans, bp,
-	    XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-
-	/*
-	 * Copy the string and inode number into the new space.
-	 */
-	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-	XFS_DIR_SF_PUT_DIRINO(&args->inumber, &namest->inumber);
-	memcpy(namest->name, args->name, args->namelen);
-	xfs_da_log_buf(args->trans, bp,
-	    XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)));
-
-	/*
-	 * Update the control info for this leaf node
-	 */
-	if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
-		INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT);
-	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
-	tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
-			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
-	map = &hdr->freemap[0];
-	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
-		if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
-			INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
-			INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
-		}
-	}
-	INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen);
-	xfs_da_log_buf(args->trans, bp,
-		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-}
-
-/*
- * Garbage collect a leaf directory block by copying it to a new buffer.
- */
-STATIC int
-xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
-		     int justcheck)
-{
-	xfs_dir_leafblock_t *leaf_s, *leaf_d;
-	xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
-	xfs_mount_t *mp;
-	char *tmpbuffer;
-	char *tmpbuffer2=NULL;
-	int rval;
-	int lbsize;
-
-	mp = trans->t_mountp;
-	lbsize = XFS_LBSIZE(mp);
-	tmpbuffer = kmem_alloc(lbsize, KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
-	memcpy(tmpbuffer, bp->data, lbsize);
-
-	/*
-	 * Make a second copy in case xfs_dir_leaf_moveents()
-	 * below destroys the original.
-	 */
-	if (musthave || justcheck) {
-		tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP);
-		memcpy(tmpbuffer2, bp->data, lbsize);
-	}
-	memset(bp->data, 0, lbsize);
-
-	/*
-	 * Copy basic information
-	 */
-	leaf_s = (xfs_dir_leafblock_t *)tmpbuffer;
-	leaf_d = bp->data;
-	hdr_s = &leaf_s->hdr;
-	hdr_d = &leaf_d->hdr;
-	hdr_d->info = hdr_s->info;	/* struct copy */
-	INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize);
-	if (!hdr_d->firstused)
-		INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1);
-	hdr_d->namebytes = 0;
-	hdr_d->count = 0;
-	hdr_d->holes = 0;
-	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
-	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
-
-	/*
-	 * Copy all entry's in the same (sorted) order,
-	 * but allocate filenames packed and in sequence.
-	 * This changes the source (leaf_s) as well.
-	 */
-	xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
-
-	if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave)
-		rval = XFS_ERROR(ENOSPC);
-	else
-		rval = 0;
-
-	if (justcheck || rval == ENOSPC) {
-		ASSERT(tmpbuffer2);
-		memcpy(bp->data, tmpbuffer2, lbsize);
-	} else {
-		xfs_da_log_buf(trans, bp, 0, lbsize - 1);
-	}
-
-	kmem_free(tmpbuffer, lbsize);
-	if (musthave || justcheck)
-		kmem_free(tmpbuffer2, lbsize);
-	return rval;
-}
-
-/*
- * Redistribute the directory entries between two leaf nodes,
- * taking into account the size of the new entry.
- *
- * NOTE: if new block is empty, then it will get the upper half of old block.
- */
-STATIC void
-xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
-				      xfs_da_state_blk_t *blk2)
-{
-	xfs_da_state_blk_t *tmp_blk;
-	xfs_dir_leafblock_t *leaf1, *leaf2;
-	xfs_dir_leaf_hdr_t *hdr1, *hdr2;
-	int count, totallen, max, space, swap;
-
-	/*
-	 * Set up environment.
-	 */
-	ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC);
-	ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC);
-	leaf1 = blk1->bp->data;
-	leaf2 = blk2->bp->data;
-	ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-
-	/*
-	 * Check ordering of blocks, reverse if it makes things simpler.
-	 */
-	swap = 0;
-	if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) {
-		tmp_blk = blk1;
-		blk1 = blk2;
-		blk2 = tmp_blk;
-		leaf1 = blk1->bp->data;
-		leaf2 = blk2->bp->data;
-		swap = 1;
-	}
-	hdr1 = &leaf1->hdr;
-	hdr2 = &leaf2->hdr;
-
-	/*
-	 * Examine entries until we reduce the absolute difference in
-	 * byte usage between the two blocks to a minimum.  Then get
-	 * the direction to copy and the number of elements to move.
-	 */
-	state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2,
-							   &count, &totallen);
-	if (swap)
-		state->inleaf = !state->inleaf;
-
-	/*
-	 * Move any entries required from leaf to leaf:
-	 */
-	if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
-		/*
-		 * Figure the total bytes to be added to the destination leaf.
-		 */
-		count = INT_GET(hdr1->count, ARCH_CONVERT) - count;	/* number entries being moved */
-		space  = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen;
-		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
-		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
-
-		/*
-		 * leaf2 is the destination, compact it if it looks tight.
-		 */
-		max  = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
-		max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
-		if (space > max) {
-			xfs_dir_leaf_compact(state->args->trans, blk2->bp,
-								 0, 0);
-		}
-
-		/*
-		 * Move high entries from leaf1 to low end of leaf2.
-		 */
-		xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count,
-					     leaf2, 0, count, state->mp);
-
-		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
-						   state->blocksize-1);
-		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
-						   state->blocksize-1);
-
-	} else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
-		/*
-		 * Figure the total bytes to be added to the destination leaf.
-		 */
-		count -= INT_GET(hdr1->count, ARCH_CONVERT);		/* number entries being moved */
-		space  = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT);
-		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
-		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
-
-		/*
-		 * leaf1 is the destination, compact it if it looks tight.
-		 */
-		max  = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
-		max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
-		if (space > max) {
-			xfs_dir_leaf_compact(state->args->trans, blk1->bp,
-								 0, 0);
-		}
-
-		/*
-		 * Move low entries from leaf2 to high end of leaf1.
-		 */
-		xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT),
-					     count, state->mp);
-
-		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
-						   state->blocksize-1);
-		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
-						   state->blocksize-1);
-	}
-
-	/*
-	 * Copy out last hashval in each block for B-tree code.
-	 */
-	blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-	blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-
-	/*
-	 * Adjust the expected index for insertion.
-	 * GROT: this doesn't work unless blk2 was originally empty.
-	 */
-	if (!state->inleaf) {
-		blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
-	}
-}
-
-/*
- * Examine entries until we reduce the absolute difference in
- * byte usage between the two blocks to a minimum.
- * GROT: Is this really necessary?  With other than a 512 byte blocksize,
- * GROT: there will always be enough room in either block for a new entry.
- * GROT: Do a double-split for this case?
- */
-STATIC int
-xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
-					   xfs_da_state_blk_t *blk1,
-					   xfs_da_state_blk_t *blk2,
-					   int *countarg, int *namebytesarg)
-{
-	xfs_dir_leafblock_t *leaf1, *leaf2;
-	xfs_dir_leaf_hdr_t *hdr1, *hdr2;
-	xfs_dir_leaf_entry_t *entry;
-	int count, max, totallen, half;
-	int lastdelta, foundit, tmp;
-
-	/*
-	 * Set up environment.
-	 */
-	leaf1 = blk1->bp->data;
-	leaf2 = blk2->bp->data;
-	hdr1 = &leaf1->hdr;
-	hdr2 = &leaf2->hdr;
-	foundit = 0;
-	totallen = 0;
-
-	/*
-	 * Examine entries until we reduce the absolute difference in
-	 * byte usage between the two blocks to a minimum.
-	 */
-	max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT);
-	half  = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
-	half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen;
-	half /= 2;
-	lastdelta = state->blocksize;
-	entry = &leaf1->entries[0];
-	for (count = 0; count < max; entry++, count++) {
-
-#define XFS_DIR_ABS(A)	(((A) < 0) ? -(A) : (A))
-		/*
-		 * The new entry is in the first block, account for it.
-		 */
-		if (count == blk1->index) {
-			tmp = totallen + (uint)sizeof(*entry)
-				+ XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen);
-			if (XFS_DIR_ABS(half - tmp) > lastdelta)
-				break;
-			lastdelta = XFS_DIR_ABS(half - tmp);
-			totallen = tmp;
-			foundit = 1;
-		}
-
-		/*
-		 * Wrap around into the second block if necessary.
-		 */
-		if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
-			leaf1 = leaf2;
-			entry = &leaf1->entries[0];
-		}
-
-		/*
-		 * Figure out if next leaf entry would be too much.
-		 */
-		tmp = totallen + (uint)sizeof(*entry)
-				+ XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
-		if (XFS_DIR_ABS(half - tmp) > lastdelta)
-			break;
-		lastdelta = XFS_DIR_ABS(half - tmp);
-		totallen = tmp;
-#undef XFS_DIR_ABS
-	}
-
-	/*
-	 * Calculate the number of namebytes that will end up in lower block.
-	 * If new entry not in lower block, fix up the count.
-	 */
-	totallen -=
-		count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
-	if (foundit) {
-		totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1) +
-			    state->args->namelen;
-	}
-
-	*countarg = count;
-	*namebytesarg = totallen;
-	return foundit;
-}
-
-/*========================================================================
- * Routines used for shrinking the Btree.
- *========================================================================*/
-
-/*
- * Check a leaf block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it, return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- */
-int
-xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_da_state_blk_t *blk;
-	xfs_da_blkinfo_t *info;
-	int count, bytes, forward, error, retval, i;
-	xfs_dablk_t blkno;
-	xfs_dabuf_t *bp;
-
-	/*
-	 * Check for the degenerate case of the block being over 50% full.
-	 * If so, it's not worth even looking to see if we might be able
-	 * to coalesce with a sibling.
-	 */
-	blk = &state->path.blk[ state->path.active-1 ];
-	info = blk->bp->data;
-	ASSERT(be16_to_cpu(info->magic) == XFS_DIR_LEAF_MAGIC);
-	leaf = (xfs_dir_leafblock_t *)info;
-	count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-	bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
-		count * (uint)sizeof(xfs_dir_leaf_entry_t) +
-		count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
-		INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-	if (bytes > (state->blocksize >> 1)) {
-		*action = 0;	/* blk over 50%, don't try to join */
-		return 0;
-	}
-
-	/*
-	 * Check for the degenerate case of the block being empty.
-	 * If the block is empty, we'll simply delete it, no need to
-	 * coalesce it with a sibling block.  We choose (arbitrarily)
-	 * to merge with the forward block unless it is NULL.
-	 */
-	if (count == 0) {
-		/*
-		 * Make altpath point to the block we want to keep and
-		 * path point to the block we want to drop (this one).
-		 */
-		forward = (info->forw != 0);
-		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da_path_shift(state, &state->altpath, forward,
-						 0, &retval);
-		if (error)
-			return error;
-		if (retval) {
-			*action = 0;
-		} else {
-			*action = 2;
-		}
-		return 0;
-	}
-
-	/*
-	 * Examine each sibling block to see if we can coalesce with
-	 * at least 25% free space to spare.  We need to figure out
-	 * whether to merge with the forward or the backward block.
-	 * We prefer coalescing with the lower numbered sibling so as
-	 * to shrink a directory over time.
-	 */
-	forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));	/* start with smaller blk num */
-	for (i = 0; i < 2; forward = !forward, i++) {
-		if (forward)
-			blkno = be32_to_cpu(info->forw);
-		else
-			blkno = be32_to_cpu(info->back);
-		if (blkno == 0)
-			continue;
-		error = xfs_da_read_buf(state->args->trans, state->args->dp,
-							    blkno, -1, &bp,
-							    XFS_DATA_FORK);
-		if (error)
-			return error;
-		ASSERT(bp != NULL);
-
-		leaf = (xfs_dir_leafblock_t *)info;
-		count  = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-		bytes  = state->blocksize - (state->blocksize>>2);
-		bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-		leaf = bp->data;
-		ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-		count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
-		bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-		bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
-		bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
-		bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
-		if (bytes >= 0)
-			break;	/* fits with at least 25% to spare */
-
-		xfs_da_brelse(state->args->trans, bp);
-	}
-	if (i >= 2) {
-		*action = 0;
-		return 0;
-	}
-	xfs_da_buf_done(bp);
-
-	/*
-	 * Make altpath point to the block we want to keep (the lower
-	 * numbered block) and path point to the block we want to drop.
-	 */
-	memcpy(&state->altpath, &state->path, sizeof(state->path));
-	if (blkno < blk->blkno) {
-		error = xfs_da_path_shift(state, &state->altpath, forward,
-						 0, &retval);
-	} else {
-		error = xfs_da_path_shift(state, &state->path, forward,
-						 0, &retval);
-	}
-	if (error)
-		return error;
-	if (retval) {
-		*action = 0;
-	} else {
-		*action = 1;
-	}
-	return 0;
-}
-
-/*
- * Remove a name from the leaf directory structure.
- *
- * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
- * If two leaves are 37% full, when combined they will leave 25% free.
- */
-int
-xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_dir_leaf_map_t *map;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	int before, after, smallest, entsize;
-	int tablesize, tmp, i;
-	xfs_mount_t *mp;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	hdr = &leaf->hdr;
-	mp = trans->t_mountp;
-	ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
-	ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT)));
-	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
-	entry = &leaf->entries[index];
-	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
-	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
-
-	/*
-	 * Scan through free region table:
-	 *    check for adjacency of free'd entry with an existing one,
-	 *    find smallest free region in case we need to replace it,
-	 *    adjust any map that borders the entry table,
-	 */
-	tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
-			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
-	map = &hdr->freemap[0];
-	tmp = INT_GET(map->size, ARCH_CONVERT);
-	before = after = -1;
-	smallest = XFS_DIR_LEAF_MAPSIZE - 1;
-	entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
-	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
-		ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
-		ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
-		if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
-			INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
-			INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
-		}
-
-		if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
-			before = i;
-		} else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
-			after = i;
-		} else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
-			tmp = INT_GET(map->size, ARCH_CONVERT);
-			smallest = i;
-		}
-	}
-
-	/*
-	 * Coalesce adjacent freemap regions,
-	 * or replace the smallest region.
-	 */
-	if ((before >= 0) || (after >= 0)) {
-		if ((before >= 0) && (after >= 0)) {
-			map = &hdr->freemap[before];
-			INT_MOD(map->size, ARCH_CONVERT, entsize);
-			INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT));
-			hdr->freemap[after].base = 0;
-			hdr->freemap[after].size = 0;
-		} else if (before >= 0) {
-			map = &hdr->freemap[before];
-			INT_MOD(map->size, ARCH_CONVERT, entsize);
-		} else {
-			map = &hdr->freemap[after];
-			INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
-			INT_MOD(map->size, ARCH_CONVERT, entsize);
-		}
-	} else {
-		/*
-		 * Replace smallest region (if it is smaller than free'd entry)
-		 */
-		map = &hdr->freemap[smallest];
-		if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
-			INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
-			INT_SET(map->size, ARCH_CONVERT, entsize);
-		}
-	}
-
-	/*
-	 * Did we remove the first entry?
-	 */
-	if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT))
-		smallest = 1;
-	else
-		smallest = 0;
-
-	/*
-	 * Compress the remaining entries and zero out the removed stuff.
-	 */
-	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-	memset((char *)namest, 0, entsize);
-	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
-
-	INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen));
-	tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
-	memmove(entry, entry + 1, tmp);
-	INT_MOD(hdr->count, ARCH_CONVERT, -1);
-	xfs_da_log_buf(trans, bp,
-	    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
-	entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
-	memset((char *)entry, 0, sizeof(xfs_dir_leaf_entry_t));
-
-	/*
-	 * If we removed the first entry, re-find the first used byte
-	 * in the name area.  Note that if the entry was the "firstused",
-	 * then we don't have a "hole" in our block resulting from
-	 * removing the name.
-	 */
-	if (smallest) {
-		tmp = XFS_LBSIZE(mp);
-		entry = &leaf->entries[0];
-		for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
-			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
-			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
-			if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
-				tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
-		}
-		INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
-		if (!hdr->firstused)
-			INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1);
-	} else {
-		hdr->holes = 1;		/* mark as needing compaction */
-	}
-
-	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-
-	/*
-	 * Check if leaf is less than 50% full, caller may want to
-	 * "join" the leaf with a sibling if so.
-	 */
-	tmp  = (uint)sizeof(xfs_dir_leaf_hdr_t);
-	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
-	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
-	tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-	if (tmp < mp->m_dir_magicpct)
-		return 1;			/* leaf is < 37% full */
-	return 0;
-}
-
-/*
- * Move all the directory entries from drop_leaf into save_leaf.
- */
-void
-xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
-				      xfs_da_state_blk_t *save_blk)
-{
-	xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
-	xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
-	xfs_mount_t *mp;
-	char *tmpbuffer;
-
-	/*
-	 * Set up environment.
-	 */
-	mp = state->mp;
-	ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC);
-	ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC);
-	drop_leaf = drop_blk->bp->data;
-	save_leaf = save_blk->bp->data;
-	ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	drop_hdr = &drop_leaf->hdr;
-	save_hdr = &save_leaf->hdr;
-
-	/*
-	 * Save last hashval from dying block for later Btree fixup.
-	 */
-	drop_blk->hashval = INT_GET(drop_leaf->entries[ drop_leaf->hdr.count-1 ].hashval, ARCH_CONVERT);
-
-	/*
-	 * Check if we need a temp buffer, or can we do it in place.
-	 * Note that we don't check "leaf" for holes because we will
-	 * always be dropping it, toosmall() decided that for us already.
-	 */
-	if (save_hdr->holes == 0) {
-		/*
-		 * dest leaf has no holes, so we add there.  May need
-		 * to make some room in the entry array.
-		 */
-		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
-			xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
-						 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
-		} else {
-			xfs_dir_leaf_moveents(drop_leaf, 0,
-					      save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT),
-					      (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
-		}
-	} else {
-		/*
-		 * Destination has holes, so we make a temporary copy
-		 * of the leaf and add them both to that.
-		 */
-		tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
-		ASSERT(tmpbuffer != NULL);
-		memset(tmpbuffer, 0, state->blocksize);
-		tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer;
-		tmp_hdr = &tmp_leaf->hdr;
-		tmp_hdr->info = save_hdr->info;	/* struct copy */
-		tmp_hdr->count = 0;
-		INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
-		if (!tmp_hdr->firstused)
-			INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1);
-		tmp_hdr->namebytes = 0;
-		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
-			xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
-						 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
-			xfs_dir_leaf_moveents(save_leaf, 0,
-					      tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
-					      (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
-		} else {
-			xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
-						 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
-			xfs_dir_leaf_moveents(drop_leaf, 0,
-					      tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
-					      (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
-		}
-		memcpy(save_leaf, tmp_leaf, state->blocksize);
-		kmem_free(tmpbuffer, state->blocksize);
-	}
-
-	xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
-					   state->blocksize - 1);
-
-	/*
-	 * Copy out last hashval in each block for B-tree code.
-	 */
-	save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-}
-
-/*========================================================================
- * Routines used for finding things in the Btree.
- *========================================================================*/
-
-/*
- * Look up a name in a leaf directory structure.
- * This is the internal routine, it uses the caller's buffer.
- *
- * Note that duplicate keys are allowed, but only check within the
- * current leaf node.  The Btree code must check in adjacent leaf nodes.
- *
- * Return in *index the index into the entry[] array of either the found
- * entry, or where the entry should have been (insert before that entry).
- *
- * Don't change the args->inumber unless we find the filename.
- */
-int
-xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	int probe, span;
-	xfs_dahash_t hashval;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) < (XFS_LBSIZE(args->dp->i_mount)/8));
-
-	/*
-	 * Binary search.  (note: small blocks will skip this loop)
-	 */
-	hashval = args->hashval;
-	probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
-	for (entry = &leaf->entries[probe]; span > 4;
-		   entry = &leaf->entries[probe]) {
-		span /= 2;
-		if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
-			probe += span;
-		else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
-			probe -= span;
-		else
-			break;
-	}
-	ASSERT((probe >= 0) && \
-	       ((!leaf->hdr.count) || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
-	ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
-
-	/*
-	 * Since we may have duplicate hashval's, find the first matching
-	 * hashval in the leaf.
-	 */
-	while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval)) {
-		entry--;
-		probe--;
-	}
-	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
-		entry++;
-		probe++;
-	}
-	if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
-		*index = probe;
-		ASSERT(args->oknoent);
-		return XFS_ERROR(ENOENT);
-	}
-
-	/*
-	 * Duplicate keys may be present, so search all of them for a match.
-	 */
-	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-		if (entry->namelen == args->namelen &&
-		    namest->name[0] == args->name[0] &&
-		    memcmp(args->name, namest->name, args->namelen) == 0) {
-			XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber);
-			*index = probe;
-			return XFS_ERROR(EEXIST);
-		}
-		entry++;
-		probe++;
-	}
-	*index = probe;
-	ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
-	return XFS_ERROR(ENOENT);
-}
-
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Move the indicated entries from one leaf to another.
- * NOTE: this routine modifies both source and destination leaves.
- */
-/* ARGSUSED */
-STATIC void
-xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
-		      xfs_dir_leafblock_t *leaf_d, int start_d,
-		      int count, xfs_mount_t *mp)
-{
-	xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
-	xfs_dir_leaf_entry_t *entry_s, *entry_d;
-	int tmp, i;
-
-	/*
-	 * Check for nothing to do.
-	 */
-	if (count == 0)
-		return;
-
-	/*
-	 * Set up environment.
-	 */
-	ASSERT(be16_to_cpu(leaf_s->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	hdr_s = &leaf_s->hdr;
-	hdr_d = &leaf_d->hdr;
-	ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
-	ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
-		((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s)));
-	ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
-	ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
-		((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d)));
-
-	ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
-	ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
-	ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
-
-	/*
-	 * Move the entries in the destination leaf up to make a hole?
-	 */
-	if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
-		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
-		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_d->entries[start_d];
-		entry_d = &leaf_d->entries[start_d + count];
-		memcpy(entry_d, entry_s, tmp);
-	}
-
-	/*
-	 * Copy all entry's in the same (sorted) order,
-	 * but allocate filenames packed and in sequence.
-	 */
-	entry_s = &leaf_s->entries[start_s];
-	entry_d = &leaf_d->entries[start_d];
-	for (i = 0; i < count; entry_s++, entry_d++, i++) {
-		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
-		tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
-		INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp));
-		entry_d->hashval = entry_s->hashval; /* INT_: direct copy */
-		INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT);
-		entry_d->namelen = entry_s->namelen;
-		ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
-		memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)),
-		       XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)), tmp);
-		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
-		memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
-		      0, tmp);
-		INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen));
-		INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen);
-		INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
-		INT_MOD(hdr_d->count, ARCH_CONVERT, +1);
-		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
-				+ (uint)sizeof(xfs_dir_leaf_hdr_t);
-		ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
-
-	}
-
-	/*
-	 * Zero out the entries we just copied.
-	 */
-	if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
-		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_s->entries[start_s];
-		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
-		memset((char *)entry_s, 0, tmp);
-	} else {
-		/*
-		 * Move the remaining entries down to fill the hole,
-		 * then zero the entries at the top.
-		 */
-		tmp  = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
-		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_s->entries[start_s + count];
-		entry_d = &leaf_s->entries[start_s];
-		memcpy(entry_d, entry_s, tmp);
-
-		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)];
-		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
-		memset((char *)entry_s, 0, tmp);
-	}
-
-	/*
-	 * Fill in the freemap information
-	 */
-	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t));
-	INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t));
-	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
-	INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, (hdr_d->freemap[2].base = 0));
-	INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, (hdr_d->freemap[2].size = 0));
-	hdr_s->holes = 1;	/* leaf may not be compact */
-}
-
-/*
- * Compare two leaf blocks "order".
- */
-int
-xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
-{
-	xfs_dir_leafblock_t *leaf1, *leaf2;
-
-	leaf1 = leaf1_bp->data;
-	leaf2 = leaf2_bp->data;
-	ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR_LEAF_MAGIC) &&
-	       (be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR_LEAF_MAGIC));
-	if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) &&
-	    ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
-	      INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
-	     (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
-	      INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * Pick up the last hashvalue from a leaf block.
- */
-xfs_dahash_t
-xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
-{
-	xfs_dir_leafblock_t *leaf;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	if (count)
-		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-	if (!leaf->hdr.count)
-		return(0);
-	return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
-}
-
-/*
- * Copy out directory entries for getdents(), for leaf directories.
- */
-int
-xfs_dir_leaf_getdents_int(
-	xfs_dabuf_t	*bp,
-	xfs_inode_t	*dp,
-	xfs_dablk_t	bno,
-	uio_t		*uio,
-	int		*eobp,
-	xfs_dirent_t	*dbp,
-	xfs_dir_put_t	put,
-	xfs_daddr_t		nextda)
-{
-	xfs_dir_leafblock_t	*leaf;
-	xfs_dir_leaf_entry_t	*entry;
-	xfs_dir_leaf_name_t	*namest;
-	int			entno, want_entno, i, nextentno;
-	xfs_mount_t		*mp;
-	xfs_dahash_t		cookhash;
-	xfs_dahash_t		nexthash = 0;
-#if (BITS_PER_LONG == 32)
-	xfs_dahash_t		lasthash = XFS_DA_MAXHASH;
-#endif
-	xfs_dir_put_args_t	p;
-
-	mp = dp->i_mount;
-	leaf = bp->data;
-	if (be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC) {
-		*eobp = 1;
-		return XFS_ERROR(ENOENT);	/* XXX wrong code */
-	}
-
-	want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
-
-	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-
-	xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf);
-
-	/*
-	 * Re-find our place.
-	 */
-	for (i = entno = 0, entry = &leaf->entries[0];
-		     i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
-			     entry++, i++) {
-
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
-				    INT_GET(entry->nameidx, ARCH_CONVERT));
-
-		if (unlikely(
-		    ((char *)namest < (char *)leaf) ||
-		    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
-			XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(1)",
-					     XFS_ERRLEVEL_LOW, mp, leaf);
-			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-		if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) {
-			if (   entno < want_entno
-			    && INT_GET(entry->hashval, ARCH_CONVERT)
-							== cookhash) {
-				/*
-				 * Trying to get to a particular offset in a
-				 * run of equal-hashval entries.
-				 */
-				entno++;
-			} else if (   want_entno > 0
-				   && entno == want_entno
-				   && INT_GET(entry->hashval, ARCH_CONVERT)
-							== cookhash) {
-				break;
-			} else {
-				entno = 0;
-				break;
-			}
-		}
-	}
-
-	if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
-		xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
-		if (!leaf->hdr.info.forw)
-			uio->uio_offset =
-				XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
-		/*
-		 * Don't set uio_offset if there's another block:
-		 * the node code will be setting uio_offset anyway.
-		 */
-		*eobp = 0;
-		return 0;
-	}
-	xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
-
-	p.dbp = dbp;
-	p.put = put;
-	p.uio = uio;
-
-	/*
-	 * We're synchronized, start copying entries out to the user.
-	 */
-	for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
-			     entry++, i++, (entno = nextentno)) {
-		int lastresid=0, retval;
-		xfs_dircook_t lastoffset;
-		xfs_dahash_t thishash;
-
-		/*
-		 * Check for a damaged directory leaf block and pick up
-		 * the inode number from this entry.
-		 */
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
-				    INT_GET(entry->nameidx, ARCH_CONVERT));
-
-		if (unlikely(
-		    ((char *)namest < (char *)leaf) ||
-		    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
-			XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(2)",
-					     XFS_ERRLEVEL_LOW, mp, leaf);
-			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-
-		xfs_dir_trace_g_duc("leaf: middle cookie  ",
-						   dp, uio, p.cook.o);
-
-		if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) {
-			nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
-
-			if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
-				nextentno = entno + 1;
-			else
-				nextentno = 0;
-			XFS_PUT_COOKIE(p.cook, mp, bno, nextentno, nexthash);
-			xfs_dir_trace_g_duc("leaf: middle cookie  ",
-						   dp, uio, p.cook.o);
-
-		} else if ((thishash = be32_to_cpu(leaf->hdr.info.forw))) {
-			xfs_dabuf_t *bp2;
-			xfs_dir_leafblock_t *leaf2;
-
-			ASSERT(nextda != -1);
-
-			retval = xfs_da_read_buf(dp->i_transp, dp, thishash,
-						 nextda, &bp2, XFS_DATA_FORK);
-			if (retval)
-				return retval;
-
-			ASSERT(bp2 != NULL);
-
-			leaf2 = bp2->data;
-
-			if (unlikely(
-			       (be16_to_cpu(leaf2->hdr.info.magic)
-						!= XFS_DIR_LEAF_MAGIC)
-			    || (be32_to_cpu(leaf2->hdr.info.back)
-						!= bno))) {	/* GROT */
-				XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(3)",
-						     XFS_ERRLEVEL_LOW, mp,
-						     leaf2);
-				xfs_da_brelse(dp->i_transp, bp2);
-
-				return XFS_ERROR(EFSCORRUPTED);
-			}
-
-			nexthash = INT_GET(leaf2->entries[0].hashval,
-								ARCH_CONVERT);
-			nextentno = -1;
-			XFS_PUT_COOKIE(p.cook, mp, thishash, 0, nexthash);
-			xfs_da_brelse(dp->i_transp, bp2);
-			xfs_dir_trace_g_duc("leaf: next blk cookie",
-						   dp, uio, p.cook.o);
-		} else {
-			nextentno = -1;
-			XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH);
-		}
-
-		/*
-		 * Save off the cookie so we can fall back should the
-		 * 'put' into the outgoing buffer fails.  To handle a run
-		 * of equal-hashvals, the off_t structure on 64bit
-		 * builds has entno built into the cookie to ID the
-		 * entry.  On 32bit builds, we only have space for the
-		 * hashval so we can't ID specific entries within a group
-		 * of same hashval entries.   For this, lastoffset is set
-		 * to the first in the run of equal hashvals so we don't
-		 * include any entries unless we can include all entries
-		 * that share the same hashval.  Hopefully the buffer
-		 * provided is big enough to handle it (see pv763517).
-		 */
-#if (BITS_PER_LONG == 32)
-		if ((thishash = INT_GET(entry->hashval, ARCH_CONVERT))
-								!= lasthash) {
-			XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
-			lastresid = uio->uio_resid;
-			lasthash = thishash;
-		} else {
-			xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped",
-						   dp, uio, p.cook.o);
-		}
-#else
-		thishash = INT_GET(entry->hashval, ARCH_CONVERT);
-		XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
-		lastresid = uio->uio_resid;
-#endif /* BITS_PER_LONG == 32 */
-
-		/*
-		 * Put the current entry into the outgoing buffer.  If we fail
-		 * then restore the UIO to the first entry in the current
-		 * run of equal-hashval entries (probably one 1 entry long).
-		 */
-		p.ino = XFS_GET_DIR_INO8(namest->inumber);
-#if XFS_BIG_INUMS
-		p.ino += mp->m_inoadd;
-#endif
-		p.name = (char *)namest->name;
-		p.namelen = entry->namelen;
-
-		retval = p.put(&p);
-
-		if (!p.done) {
-			uio->uio_offset = lastoffset.o;
-			uio->uio_resid = lastresid;
-
-			*eobp = 1;
-
-			xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
-
-			return retval;
-		}
-	}
-
-	uio->uio_offset = p.cook.o;
-
-	*eobp = 0;
-
-	xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
-
-	return 0;
-}
-
-/*
- * Format a dirent64 structure and copy it out the the user's buffer.
- */
-int
-xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa)
-{
-	iovec_t *iovp;
-	int reclen, namelen;
-	xfs_dirent_t *idbp;
-	uio_t *uio;
-
-	namelen = pa->namelen;
-	reclen = DIRENTSIZE(namelen);
-	uio = pa->uio;
-	if (reclen > uio->uio_resid) {
-		pa->done = 0;
-		return 0;
-	}
-	iovp = uio->uio_iov;
-	idbp = (xfs_dirent_t *)iovp->iov_base;
-	iovp->iov_base = (char *)idbp + reclen;
-	iovp->iov_len -= reclen;
-	uio->uio_resid -= reclen;
-	idbp->d_reclen = reclen;
-	idbp->d_ino = pa->ino;
-	idbp->d_off = pa->cook.o;
-	idbp->d_name[namelen] = '\0';
-	pa->done = 1;
-	memcpy(idbp->d_name, pa->name, namelen);
-	return 0;
-}
-
-/*
- * Format a dirent64 structure and copy it out the the user's buffer.
- */
-int
-xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa)
-{
-	int		retval, reclen, namelen;
-	xfs_dirent_t	*idbp;
-	uio_t		*uio;
-
-	namelen = pa->namelen;
-	reclen = DIRENTSIZE(namelen);
-	uio = pa->uio;
-	if (reclen > uio->uio_resid) {
-		pa->done = 0;
-		return 0;
-	}
-	idbp = pa->dbp;
-	idbp->d_reclen = reclen;
-	idbp->d_ino = pa->ino;
-	idbp->d_off = pa->cook.o;
-	idbp->d_name[namelen] = '\0';
-	memcpy(idbp->d_name, pa->name, namelen);
-	retval = uio_read((caddr_t)idbp, reclen, uio);
-	pa->done = (retval == 0);
-	return retval;
-}
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
deleted file mode 100644
index eb8cd9a4667f..000000000000
--- a/fs/xfs/xfs_dir_leaf.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_LEAF_H__
-#define	__XFS_DIR_LEAF_H__
-
-/*
- * Directory layout, internal structure, access macros, etc.
- *
- * Large directories are structured around Btrees where all the data
- * elements are in the leaf nodes.  Filenames are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of a filename may not be unique, we may have duplicate keys.  The
- * internal links in the Btree are logical block offsets into the file.
- */
-
-struct uio;
-struct xfs_bmap_free;
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_da_state;
-struct xfs_da_state_blk;
-struct xfs_dir_put_args;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*========================================================================
- * Directory Structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This is the structure of the leaf nodes in the Btree.
- *
- * Struct leaf_entry's are packed from the top.  Names grow from the bottom
- * but are not packed.  The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such,
- * smaller runs are dropped.  When the freemap doesn't show enough space
- * for an allocation, we compact the namelist area and try again.  If we
- * still don't have enough space, then we have to split the block.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual string.  The root and intermediate node search always takes
- * the first-in-the-block key match found, so we should only have to work
- * "forw"ard.  If none matches, continue with the "forw"ard leaf nodes
- * until the hash key changes or the filename is found.
- *
- * The parent directory and the self-pointer are explicitly represented
- * (ie: there are entries for "." and "..").
- *
- * Note that the count being a __uint16_t limits us to something like a
- * blocksize of 1.3MB in the face of worst case (short) filenames.
- */
-#define XFS_DIR_LEAF_MAPSIZE	3	/* how many freespace slots */
-
-typedef struct xfs_dir_leaf_map {	/* RLE map of free bytes */
-	__uint16_t	base;	 	/* base of free region */
-	__uint16_t	size; 		/* run length of free region */
-} xfs_dir_leaf_map_t;
-
-typedef struct xfs_dir_leaf_hdr {	/* constant-structure header block */
-	xfs_da_blkinfo_t info;		/* block type, links, etc. */
-	__uint16_t	count;		/* count of active leaf_entry's */
-	__uint16_t	namebytes;	/* num bytes of name strings stored */
-	__uint16_t	firstused;	/* first used byte in name area */
-	__uint8_t	holes;		/* != 0 if blk needs compaction */
-	__uint8_t	pad1;
-	xfs_dir_leaf_map_t freemap[XFS_DIR_LEAF_MAPSIZE];
-} xfs_dir_leaf_hdr_t;
-
-typedef struct xfs_dir_leaf_entry {	/* sorted on key, not name */
-	xfs_dahash_t	hashval;	/* hash value of name */
-	__uint16_t	nameidx;	/* index into buffer of name */
-	__uint8_t	namelen;	/* length of name string */
-	__uint8_t	pad2;
-} xfs_dir_leaf_entry_t;
-
-typedef struct xfs_dir_leaf_name {
-	xfs_dir_ino_t	inumber;	/* inode number for this key */
-	__uint8_t	name[1];	/* name string itself */
-} xfs_dir_leaf_name_t;
-
-typedef struct xfs_dir_leafblock {
-	xfs_dir_leaf_hdr_t	hdr;	/* constant-structure header block */
-	xfs_dir_leaf_entry_t	entries[1];	/* var sized array */
-	xfs_dir_leaf_name_t	namelist[1];	/* grows from bottom of buf */
-} xfs_dir_leafblock_t;
-
-/*
- * Length of name for which a 512-byte block filesystem
- * can get a double split.
- */
-#define	XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN	\
-	(512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \
-	 (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \
-	 (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1)
-
-typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa);
-
-typedef union {
-	xfs_off_t		o;		/* offset (cookie) */
-	/*
-	 * Watch the order here (endian-ness dependent).
-	 */
-	struct {
-#ifndef XFS_NATIVE_HOST
-		xfs_dahash_t	h;	/* hash value */
-		__uint32_t	be;	/* block and entry */
-#else
-		__uint32_t	be;	/* block and entry */
-		xfs_dahash_t	h;	/* hash value */
-#endif /* XFS_NATIVE_HOST */
-	} s;
-} xfs_dircook_t;
-
-#define	XFS_PUT_COOKIE(c,mp,bno,entry,hash)	\
-	((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
-
-typedef struct xfs_dir_put_args {
-	xfs_dircook_t	cook;		/* cookie of (next) entry */
-	xfs_intino_t	ino;		/* inode number */
-	struct xfs_dirent *dbp;		/* buffer pointer */
-	char		*name;		/* directory entry name */
-	int		namelen;	/* length of name */
-	int		done;		/* output: set if value was stored */
-	xfs_dir_put_t	put;		/* put function ptr (i/o) */
-	struct uio	*uio;		/* uio control structure */
-} xfs_dir_put_args_t;
-
-#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len)	\
-	xfs_dir_leaf_entsize_byname(len)
-static inline int xfs_dir_leaf_entsize_byname(int len)
-{
-	return (uint)sizeof(xfs_dir_leaf_name_t)-1 + len;
-}
-
-#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)	\
-	xfs_dir_leaf_entsize_byentry(entry)
-static inline int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry)
-{
-	return (uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen;
-}
-
-#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset)	\
-	xfs_dir_leaf_namestruct(leafp,offset)
-static inline xfs_dir_leaf_name_t *
-xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset)
-{
-	return (xfs_dir_leaf_name_t *)&((char *)(leafp))[offset];
-}
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Internal routines when dirsize < XFS_LITINO(mp).
- */
-int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent);
-int xfs_dir_shortform_addname(struct xfs_da_args *args);
-int xfs_dir_shortform_lookup(struct xfs_da_args *args);
-int xfs_dir_shortform_to_leaf(struct xfs_da_args *args);
-int xfs_dir_shortform_removename(struct xfs_da_args *args);
-int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
-			       struct xfs_dirent *dbp, xfs_dir_put_t put);
-int xfs_dir_shortform_replace(struct xfs_da_args *args);
-
-/*
- * Internal routines when dirsize == XFS_LBSIZE(mp).
- */
-int xfs_dir_leaf_to_node(struct xfs_da_args *args);
-int xfs_dir_leaf_to_shortform(struct xfs_da_args *args);
-
-/*
- * Routines used for growing the Btree.
- */
-int	xfs_dir_leaf_split(struct xfs_da_state *state,
-				  struct xfs_da_state_blk *oldblk,
-				  struct xfs_da_state_blk *newblk);
-int	xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer,
-				struct xfs_da_args *args, int insertion_index);
-int	xfs_dir_leaf_addname(struct xfs_da_args *args);
-int	xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer,
-				       struct xfs_da_args *args,
-				       int *index_found_at);
-int	xfs_dir_leaf_remove(struct xfs_trans *trans,
-				   struct xfs_dabuf *leaf_buffer,
-				   int index_to_remove);
-int	xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp,
-					 xfs_dablk_t bno, struct uio *uio,
-					 int *eobp, struct xfs_dirent *dbp,
-					 xfs_dir_put_t put, xfs_daddr_t nextda);
-
-/*
- * Routines used for shrinking the Btree.
- */
-int	xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void	xfs_dir_leaf_unbalance(struct xfs_da_state *state,
-					     struct xfs_da_state_blk *drop_blk,
-					     struct xfs_da_state_blk *save_blk);
-
-/*
- * Utility routines.
- */
-uint	xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count);
-int	xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp,
-				  struct xfs_dabuf *leaf2_bp);
-int	xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa);
-int	xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa);
-int	xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-
-/*
- * Global data.
- */
-extern xfs_dahash_t	xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-
-#endif /* __XFS_DIR_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h
deleted file mode 100644
index 5b20b4d3f57d..000000000000
--- a/fs/xfs/xfs_dir_sf.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_SF_H__
-#define	__XFS_DIR_SF_H__
-
-/*
- * Directory layout when stored internal to an inode.
- *
- * Small directories are packed as tightly as possible so as to
- * fit into the literal area of the inode.
- */
-
-typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t;
-
-/*
- * The parent directory has a dedicated field, and the self-pointer must
- * be calculated on the fly.
- *
- * Entries are packed toward the top as tight as possible.  The header
- * and the elements much be memcpy'd out into a work area to get correct
- * alignment for the inode number fields.
- */
-typedef struct xfs_dir_sf_hdr {		/* constant-structure header block */
-	xfs_dir_ino_t	parent;		/* parent dir inode number */
-	__uint8_t	count;		/* count of active entries */
-} xfs_dir_sf_hdr_t;
-
-typedef struct xfs_dir_sf_entry {
-	xfs_dir_ino_t	inumber;	/* referenced inode number */
-	__uint8_t	namelen;	/* actual length of name (no NULL) */
-	__uint8_t	name[1];	/* name */
-} xfs_dir_sf_entry_t;
-
-typedef struct xfs_dir_shortform {
-	xfs_dir_sf_hdr_t	hdr;
-	xfs_dir_sf_entry_t	list[1];	/* variable sized array */
-} xfs_dir_shortform_t;
-
-/*
- * We generate this then sort it, so that readdirs are returned in
- * hash-order.  Else seekdir won't work.
- */
-typedef struct xfs_dir_sf_sort {
-	__uint8_t	entno;		/* .=0, ..=1, else entry# + 2 */
-	__uint8_t	seqno;		/* sequence # with same hash value */
-	__uint8_t	namelen;	/* length of name value (no null) */
-	xfs_dahash_t	hash;		/* this entry's hash value */
-	xfs_intino_t	ino;		/* this entry's inode number */
-	char		*name;		/* name value, pointer into buffer */
-} xfs_dir_sf_sort_t;
-
-#define	XFS_DIR_SF_GET_DIRINO(from,to)	xfs_dir_sf_get_dirino(from, to)
-static inline void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to)
-{
-	*(to) = XFS_GET_DIR_INO8(*from);
-}
-
-#define	XFS_DIR_SF_PUT_DIRINO(from,to)	xfs_dir_sf_put_dirino(from, to)
-static inline void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to)
-{
-	XFS_PUT_DIR_INO8(*(from), *(to));
-}
-
-#define XFS_DIR_SF_ENTSIZE_BYNAME(len)	xfs_dir_sf_entsize_byname(len)
-static inline int xfs_dir_sf_entsize_byname(int len)
-{
-	return (uint)sizeof(xfs_dir_sf_entry_t)-1 + (len);
-}
-
-#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)	xfs_dir_sf_entsize_byentry(sfep)
-static inline int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep)
-{
-	return (uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen;
-}
-
-#define XFS_DIR_SF_NEXTENTRY(sfep)		xfs_dir_sf_nextentry(sfep)
-static inline xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep)
-{
-	return (xfs_dir_sf_entry_t *) \
-		((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep));
-}
-
-#define XFS_DIR_SF_ALLFIT(count,totallen)	\
-	xfs_dir_sf_allfit(count,totallen)
-static inline int xfs_dir_sf_allfit(int count, int totallen)
-{
-	return ((uint)sizeof(xfs_dir_sf_hdr_t) + \
-	       ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen));
-}
-
-#if defined(XFS_DIR_TRACE)
-
-/*
- * Kernel tracing support for directories.
- */
-struct uio;
-struct xfs_inode;
-struct xfs_da_intnode;
-struct xfs_dinode;
-struct xfs_dir_leafblock;
-struct xfs_dir_leaf_entry;
-
-#define	XFS_DIR_TRACE_SIZE	4096	/* size of global trace buffer */
-extern ktrace_t	*xfs_dir_trace_buf;
-
-/*
- * Trace record types.
- */
-#define	XFS_DIR_KTRACE_G_DU	1	/* dp, uio */
-#define	XFS_DIR_KTRACE_G_DUB	2	/* dp, uio, bno */
-#define	XFS_DIR_KTRACE_G_DUN	3	/* dp, uio, node */
-#define	XFS_DIR_KTRACE_G_DUL	4	/* dp, uio, leaf */
-#define	XFS_DIR_KTRACE_G_DUE	5	/* dp, uio, leaf entry */
-#define	XFS_DIR_KTRACE_G_DUC	6	/* dp, uio, cookie */
-
-void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio);
-void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio,
-			      xfs_dablk_t bno);
-void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio,
-			      struct xfs_da_intnode *node);
-void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio,
-			      struct xfs_dir_leafblock *leaf);
-void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio,
-			      struct xfs_dir_leaf_entry *entry);
-void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio,
-			      xfs_off_t cookie);
-void xfs_dir_trace_enter(int type, char *where,
-			     void *a0, void *a1, void *a2, void *a3,
-			     void *a4, void *a5, void *a6, void *a7,
-			     void *a8, void *a9, void *a10, void *a11);
-#else
-#define	xfs_dir_trace_g_du(w,d,u)
-#define	xfs_dir_trace_g_dub(w,d,u,b)
-#define	xfs_dir_trace_g_dun(w,d,u,n)
-#define	xfs_dir_trace_g_dul(w,d,u,l)
-#define	xfs_dir_trace_g_due(w,d,u,e)
-#define	xfs_dir_trace_g_duc(w,d,u,c)
-#endif /* DEBUG */
-
-#endif	/* __XFS_DIR_SF_H__ */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index 00b1540f8108..4e7865ad6f0e 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -189,6 +189,6 @@ typedef enum {
 #define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
 
 
-extern struct bhv_vfsops xfs_dmops;
+extern struct bhv_module_vfsops xfs_dmops;
 
 #endif  /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
index 629795b3b3d5..1e4a35ddf7f9 100644
--- a/fs/xfs/xfs_dmops.c
+++ b/fs/xfs/xfs_dmops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 2a21c5024017..b95681b03d81 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -22,12 +22,10 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index f19282ec8549..6cf6d8769b97 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
@@ -294,6 +293,71 @@ xfs_efi_init(xfs_mount_t	*mp,
 }
 
 /*
+ * Copy an EFI format buffer from the given buf, and into the destination
+ * EFI format structure.
+ * The given buffer can be in 32 bit or 64 bit form (which has different
+ * padding), one of which will be the native format for this kernel.
+ * It will handle the conversion of formats if necessary.
+ *
+ * The layout is identified by comparing buf->i_len against the length each
+ * layout would have for the efi_nextents value read from the buffer.
+ * Returns 0 after filling in dst_efi_fmt, or EFSCORRUPTED if buf->i_len
+ * matches none of the three layouts.
+ */
+int
+xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+{
+	xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr;
+	uint i;
+	uint len = sizeof(xfs_efi_log_format_t) +
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
+	uint len32 = sizeof(xfs_efi_log_format_32_t) +
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
+	uint len64 = sizeof(xfs_efi_log_format_64_t) +
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
+
+	if (buf->i_len == len) {
+		/* Same layout as xfs_efi_log_format_t: copy it verbatim. */
+		memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+		return 0;
+	} else if (buf->i_len == len32) {
+		/* Packed 32 bit layout: convert field by field. */
+		xfs_efi_log_format_32_t *src_efi_fmt_32 =
+			(xfs_efi_log_format_32_t *)buf->i_addr;
+
+		dst_efi_fmt->efi_type     = src_efi_fmt_32->efi_type;
+		dst_efi_fmt->efi_size     = src_efi_fmt_32->efi_size;
+		dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents;
+		dst_efi_fmt->efi_id       = src_efi_fmt_32->efi_id;
+		for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+			dst_efi_fmt->efi_extents[i].ext_start =
+				src_efi_fmt_32->efi_extents[i].ext_start;
+			dst_efi_fmt->efi_extents[i].ext_len =
+				src_efi_fmt_32->efi_extents[i].ext_len;
+		}
+		return 0;
+	} else if (buf->i_len == len64) {
+		/* Padded 64 bit layout: convert field by field. */
+		xfs_efi_log_format_64_t *src_efi_fmt_64 =
+			(xfs_efi_log_format_64_t *)buf->i_addr;
+
+		dst_efi_fmt->efi_type     = src_efi_fmt_64->efi_type;
+		dst_efi_fmt->efi_size     = src_efi_fmt_64->efi_size;
+		dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents;
+		dst_efi_fmt->efi_id       = src_efi_fmt_64->efi_id;
+		for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+			dst_efi_fmt->efi_extents[i].ext_start =
+				src_efi_fmt_64->efi_extents[i].ext_start;
+			dst_efi_fmt->efi_extents[i].ext_len =
+				src_efi_fmt_64->efi_extents[i].ext_len;
+		}
+		return 0;
+	}
+	/* i_len fits none of the three layouts. */
+	return EFSCORRUPTED;
+}
+
+/*
  * This is called by the efd item code below to release references to
  * the given efi item.  Each efd calls this with the number of
  * extents that it has logged, and when the sum of these reaches
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 5bf681708fec..0ea45edaab03 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -27,6 +27,24 @@ typedef struct xfs_extent {
 } xfs_extent_t;
 
 /*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+
+typedef struct xfs_extent_32 {
+	xfs_dfsbno_t	ext_start;
+	xfs_extlen_t	ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+	xfs_dfsbno_t	ext_start;
+	xfs_extlen_t	ext_len;
+	__uint32_t	ext_pad;
+} xfs_extent_64_t;
+
+/*
  * This is the structure used to lay out an efi log item in the
  * log.  The efi_extents field is a variable size array whose
  * size is given by efi_nextents.
@@ -39,6 +57,22 @@ typedef struct xfs_efi_log_format {
 	xfs_extent_t		efi_extents[1];	/* array of extents to free */
 } xfs_efi_log_format_t;
 
+typedef struct xfs_efi_log_format_32 {
+	unsigned short		efi_type;	/* efi log item type */
+	unsigned short		efi_size;	/* size of this item */
+	uint			efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_32_t		efi_extents[1];	/* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+typedef struct xfs_efi_log_format_64 {
+	unsigned short		efi_type;	/* efi log item type */
+	unsigned short		efi_size;	/* size of this item */
+	uint			efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_64_t		efi_extents[1];	/* array of extents to free */
+} xfs_efi_log_format_64_t;
+
 /*
  * This is the structure used to lay out an efd log item in the
  * log.  The efd_extents array is a variable size array whose
@@ -52,6 +86,22 @@ typedef struct xfs_efd_log_format {
 	xfs_extent_t		efd_extents[1];	/* array of extents freed */
 } xfs_efd_log_format_t;
 
+typedef struct xfs_efd_log_format_32 {
+	unsigned short		efd_type;	/* efd log item type */
+	unsigned short		efd_size;	/* size of this item */
+	uint			efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_32_t		efd_extents[1];	/* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+typedef struct xfs_efd_log_format_64 {
+	unsigned short		efd_type;	/* efd log item type */
+	unsigned short		efd_size;	/* size of this item */
+	uint			efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_64_t		efd_extents[1];	/* array of extents freed */
+} xfs_efd_log_format_64_t;
+
 
 #ifdef __KERNEL__
 
@@ -103,7 +153,8 @@ extern struct kmem_zone	*xfs_efd_zone;
 xfs_efi_log_item_t	*xfs_efi_init(struct xfs_mount *, uint);
 xfs_efd_log_item_t	*xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
 				      uint);
-
+int			xfs_efi_copy_format(xfs_log_iovec_t *buf,
+					    xfs_efi_log_format_t *dst_efi_fmt);
 void			xfs_efi_item_free(xfs_efi_log_item_t *);
 
 #endif	/* __KERNEL__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 14010f1fa82f..0f0ad1535951 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -67,14 +67,15 @@ struct fsxattr {
 #define XFS_XFLAG_NOSYMLINKS	0x00000400	/* disallow symlink creation */
 #define XFS_XFLAG_EXTSIZE	0x00000800	/* extent size allocator hint */
 #define XFS_XFLAG_EXTSZINHERIT	0x00001000	/* inherit inode extent size */
+#define XFS_XFLAG_NODEFRAG	0x00002000	/* do not defragment */
 #define XFS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /*
  * Structure for XFS_IOC_GETBMAP.
  * On input, fill in bmv_offset and bmv_length of the first structure
- * to indicate the area of interest in the file, and bmv_entry with the
- * number of array elements given.  The first structure is updated on
- * return to give the offset and length for the next call.
+ * to indicate the area of interest in the file, and bmv_entries with
+ * the number of array elements given back.  The first structure is
+ * updated on return to give the offset and length for the next call.
  */
 #ifndef HAVE_GETBMAP
 struct getbmap {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dfa3527b20a7..077629bab532 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -542,14 +540,13 @@ xfs_reserve_blocks(
 }
 
 void
-xfs_fs_log_dummy(xfs_mount_t *mp)
+xfs_fs_log_dummy(
+	xfs_mount_t	*mp)
 {
-	xfs_trans_t *tp;
-	xfs_inode_t *ip;
-
+	xfs_trans_t	*tp;
+	xfs_inode_t	*ip;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-	atomic_inc(&mp->m_active_trans);
 	if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
 		xfs_trans_cancel(tp, 0);
 		return;
@@ -574,21 +571,22 @@ xfs_fs_goingdown(
 {
 	switch (inflags) {
 	case XFS_FSOP_GOING_FLAGS_DEFAULT: {
-		struct vfs *vfsp = XFS_MTOVFS(mp);
+		struct bhv_vfs *vfsp = XFS_MTOVFS(mp);
 		struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
 
 		if (sb && !IS_ERR(sb)) {
-			xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
 			thaw_bdev(sb->s_bdev, sb);
 		}
 	
 		break;
 	}
 	case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
-		xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
 		break;
 	case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
-		xfs_force_shutdown(mp, XFS_FORCE_UMOUNT|XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp,
+				SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
 		break;
 	default:
 		return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index deddbd03c166..33164a85aa9d 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1174,6 +1172,9 @@ xfs_dilocate(
 	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
 	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 #ifdef DEBUG
+		/* no diagnostics for bulkstat, ino comes from userspace */
+		if (flags & XFS_IMAP_BULKSTAT)
+			return XFS_ERROR(EINVAL);
 		if (agno >= mp->m_sb.sb_agcount) {
 			xfs_fs_cmn_err(CE_ALERT, mp,
 					"xfs_dilocate: agno (%d) >= "
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 60c65683462d..616eeeb6953e 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b53854325266..0724df7fabb7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -186,7 +184,7 @@ xfs_ihash_promote(
  */
 STATIC int
 xfs_iget_core(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
 	xfs_ino_t	ino,
@@ -198,7 +196,7 @@ xfs_iget_core(
 	xfs_ihash_t	*ih;
 	xfs_inode_t	*ip;
 	xfs_inode_t	*iq;
-	vnode_t		*inode_vp;
+	bhv_vnode_t	*inode_vp;
 	ulong		version;
 	int		error;
 	/* REFERENCED */
@@ -468,7 +466,7 @@ finish_inode:
 	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
 	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
 	 */
-	VFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
+	bhv_vfs_init_vnode(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
 
 	return 0;
 }
@@ -489,7 +487,7 @@ xfs_iget(
 	xfs_daddr_t	bno)
 {
 	struct inode	*inode;
-	vnode_t		*vp = NULL;
+	bhv_vnode_t	*vp = NULL;
 	int		error;
 
 	XFS_STATS_INC(xs_ig_attempts);
@@ -543,7 +541,7 @@ retry:
 void
 xfs_inode_lock_init(
 	xfs_inode_t	*ip,
-	vnode_t		*vp)
+	bhv_vnode_t	*vp)
 {
 	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
 		     "xfsino", (long)vp->v_number);
@@ -603,12 +601,10 @@ void
 xfs_iput(xfs_inode_t	*ip,
 	 uint		lock_flags)
 {
-	vnode_t	*vp = XFS_ITOV(ip);
+	bhv_vnode_t	*vp = XFS_ITOV(ip);
 
 	vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address);
-
 	xfs_iunlock(ip, lock_flags);
-
 	VN_RELE(vp);
 }
 
@@ -619,7 +615,7 @@ void
 xfs_iput_new(xfs_inode_t	*ip,
 	     uint		lock_flags)
 {
-	vnode_t		*vp = XFS_ITOV(ip);
+	bhv_vnode_t	*vp = XFS_ITOV(ip);
 	struct inode	*inode = vn_to_inode(vp);
 
 	vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
@@ -645,7 +641,7 @@ xfs_iput_new(xfs_inode_t	*ip,
 void
 xfs_ireclaim(xfs_inode_t *ip)
 {
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	/*
 	 * Remove from old hash list and mount list.
@@ -1033,6 +1029,6 @@ xfs_iflock_nowait(xfs_inode_t *ip)
 void
 xfs_ifunlock(xfs_inode_t *ip)
 {
-	ASSERT(valusema(&(ip->i_flock)) <= 0);
+	ASSERT(issemalocked(&(ip->i_flock)));
 	vsema(&(ip->i_flock));
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 94b60dd03801..5fa0adb7e173 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -26,14 +26,12 @@
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -256,13 +254,11 @@ xfs_itobp(
 	xfs_daddr_t	bno,
 	uint		imap_flags)
 {
+	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
 	int		error;
-	xfs_imap_t	imap;
-#ifdef __KERNEL__
 	int		i;
 	int		ni;
-#endif
 
 	if (ip->i_blkno == (xfs_daddr_t)0) {
 		/*
@@ -319,7 +315,6 @@ xfs_itobp(
 	 */
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
 				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
-
 	if (error) {
 #ifdef DEBUG
 		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
@@ -330,17 +325,21 @@ xfs_itobp(
 #endif /* DEBUG */
 		return error;
 	}
-#ifdef __KERNEL__
+
 	/*
 	 * Validate the magic number and version of every inode in the buffer
 	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+	 * No validation is done here in userspace (xfs_repair).
 	 */
-#ifdef DEBUG
+#if !defined(__KERNEL__)
+	ni = 0;
+#elif defined(DEBUG)
 	ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 :
 		(BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog);
-#else
+#else	/* usual case */
 	ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1;
 #endif
+
 	for (i = 0; i < ni; i++) {
 		int		di_ok;
 		xfs_dinode_t	*dip;
@@ -352,8 +351,11 @@ xfs_itobp(
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
 				 XFS_RANDOM_ITOBP_INOTOBP))) {
 #ifdef DEBUG
-			prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
-				mp->m_ddev_targp,
+			if (!(imap_flags & XFS_IMAP_BULKSTAT))
+				cmn_err(CE_ALERT,
+					"Device %s - bad inode magic/vsn "
+					"daddr %lld #%d (magic=%x)",
+				XFS_BUFTARG_NAME(mp->m_ddev_targp),
 				(unsigned long long)imap.im_blkno, i,
 				INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
 #endif
@@ -363,7 +365,6 @@ xfs_itobp(
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 	}
-#endif	/* __KERNEL__ */
 
 	xfs_inobp_check(mp, bp);
 
@@ -782,7 +783,6 @@ xfs_xlate_dinode_core(
 
 STATIC uint
 _xfs_dic2xflags(
-	xfs_dinode_core_t	*dic,
 	__uint16_t		di_flags)
 {
 	uint			flags = 0;
@@ -812,6 +812,8 @@ _xfs_dic2xflags(
 			flags |= XFS_XFLAG_EXTSIZE;
 		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 			flags |= XFS_XFLAG_EXTSZINHERIT;
+		if (di_flags & XFS_DIFLAG_NODEFRAG)
+			flags |= XFS_XFLAG_NODEFRAG;
 	}
 
 	return flags;
@@ -823,16 +825,16 @@ xfs_ip2xflags(
 {
 	xfs_dinode_core_t	*dic = &ip->i_d;
 
-	return _xfs_dic2xflags(dic, dic->di_flags) |
-		(XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
+	return _xfs_dic2xflags(dic->di_flags) |
+				(XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
 }
 
 uint
 xfs_dic2xflags(
 	xfs_dinode_core_t	*dic)
 {
-	return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) |
-		(XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
+	return _xfs_dic2xflags(INT_GET(dic->di_flags, ARCH_CONVERT)) |
+				(XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
 }
 
 /*
@@ -1083,7 +1085,7 @@ xfs_ialloc(
 {
 	xfs_ino_t	ino;
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	uint		flags;
 	int		error;
 
@@ -1221,6 +1223,9 @@ xfs_ialloc(
 				di_flags |= XFS_DIFLAG_NOSYMLINKS;
 			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 				di_flags |= XFS_DIFLAG_PROJINHERIT;
+			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+			    xfs_inherit_nodefrag)
+				di_flags |= XFS_DIFLAG_NODEFRAG;
 			ip->i_d.di_flags |= di_flags;
 		}
 		/* FALLTHROUGH */
@@ -1244,8 +1249,8 @@ xfs_ialloc(
 	 */
 	xfs_trans_log_inode(tp, ip, flags);
 
-	/* now that we have an i_mode  we can set Linux inode ops (& unlock) */
-	VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
+	/* now that we have an i_mode we can set up inode ops and unlock */
+	bhv_vfs_init_vnode(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
 
 	*ipp = ip;
 	return 0;
@@ -1285,7 +1290,7 @@ xfs_isize_check(
 				       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
 			  map_first),
 			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
-			 NULL))
+			 NULL, NULL))
 	    return;
 	ASSERT(nimaps == 1);
 	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1421,7 +1426,7 @@ xfs_itruncate_start(
 	xfs_fsize_t	last_byte;
 	xfs_off_t	toss_start;
 	xfs_mount_t	*mp;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
 	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
@@ -1434,9 +1439,9 @@ xfs_itruncate_start(
 	vn_iowait(vp);  /* wait for the completion of any pending DIOs */
 	
 	/*
-	 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers
+	 * Call toss_pages or flushinval_pages to get rid of pages
 	 * overlapping the region being removed.  We have to use
-	 * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the
+	 * the less efficient flushinval_pages in the case that the
 	 * caller may not be able to finish the truncate without
 	 * dropping the inode's I/O lock.  Make sure
 	 * to catch any pages brought in by buffers overlapping
@@ -1445,10 +1450,10 @@ xfs_itruncate_start(
 	 * so that we don't toss things on the same block as
 	 * new_size but before it.
 	 *
-	 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
+	 * Before calling toss_pages or flushinval_pages, make sure to
 	 * call remapf() over the same region if the file is mapped.
 	 * This frees up mapped file references to the pages in the
-	 * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
+	 * given range and for the flushinval_pages case it ensures
 	 * that we get the latest mapped changes flushed out.
 	 */
 	toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
@@ -1466,9 +1471,9 @@ xfs_itruncate_start(
 			 last_byte);
 	if (last_byte > toss_start) {
 		if (flags & XFS_ITRUNC_DEFINITE) {
-			VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+			bhv_vop_toss_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
 		} else {
-			VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+			bhv_vop_flushinval_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
 		}
 	}
 
@@ -1666,12 +1671,13 @@ xfs_itruncate_finish(
 		 * runs.
 		 */
 		XFS_BMAP_INIT(&free_list, &first_block);
-		error = xfs_bunmapi(ntp, ip, first_unmap_block,
-				    unmap_len,
+		error = XFS_BUNMAPI(mp, ntp, &ip->i_iocore,
+				    first_unmap_block, unmap_len,
 				    XFS_BMAPI_AFLAG(fork) |
 				      (sync ? 0 : XFS_BMAPI_ASYNC),
 				    XFS_ITRUNC_MAX_EXTENTS,
-				    &first_block, &free_list, &done);
+				    &first_block, &free_list,
+				    NULL, &done);
 		if (error) {
 			/*
 			 * If the bunmapi call encounters an error,
@@ -2745,13 +2751,14 @@ xfs_iunpin(
 		 * the inode to become unpinned.
 		 */
 		if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
-			vnode_t	*vp = XFS_ITOV_NULL(ip);
+			bhv_vnode_t	*vp = XFS_ITOV_NULL(ip);
 
 			/* make sync come back and flush this inode */
 			if (vp) {
 				struct inode	*inode = vn_to_inode(vp);
 
-				if (!(inode->i_state & I_NEW))
+				if (!(inode->i_state &
+						(I_NEW|I_FREEING|I_CLEAR)))
 					mark_inode_dirty_sync(inode);
 			}
 		}
@@ -2916,13 +2923,6 @@ xfs_iflush_fork(
 			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
 			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
 		}
-		if (whichfork == XFS_DATA_FORK) {
-			if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
-				XFS_ERROR_REPORT("xfs_iflush_fork",
-						 XFS_ERRLEVEL_LOW, mp);
-				return XFS_ERROR(EFSCORRUPTED);
-			}
-		}
 		break;
 
 	case XFS_DINODE_FMT_EXTENTS:
@@ -3006,7 +3006,7 @@ xfs_iflush(
 	XFS_STATS_INC(xs_iflush_count);
 
 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
-	ASSERT(valusema(&ip->i_flock) <= 0);
+	ASSERT(issemalocked(&(ip->i_flock)));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
 
@@ -3199,7 +3199,7 @@ xfs_iflush(
 
 corrupt_out:
 	xfs_buf_relse(bp);
-	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	xfs_iflush_abort(ip);
 	/*
 	 * Unlocks the flush lock
@@ -3221,7 +3221,7 @@ cluster_corrupt_out:
 		xfs_buf_relse(bp);
 	}
 
-	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 
 	if(!bufwasdelwri)  {
 		/*
@@ -3264,7 +3264,7 @@ xfs_iflush_int(
 	SPLDECL(s);
 
 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
-	ASSERT(valusema(&ip->i_flock) <= 0);
+	ASSERT(issemalocked(&(ip->i_flock)));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
 
@@ -3504,7 +3504,7 @@ xfs_iflush_all(
 	xfs_mount_t	*mp)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
  again:
 	XFS_MOUNT_ILOCK(mp);
@@ -4180,7 +4180,7 @@ xfs_iext_direct_to_inline(
 	 */
 	memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
 		nextents * sizeof(xfs_bmbt_rec_t));
-	kmem_free(ifp->if_u1.if_extents, KM_SLEEP);
+	kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
 	ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 	ifp->if_real_bytes = 0;
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3b544db1790b..d10b76ed1e5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -102,9 +102,9 @@ typedef struct xfs_ifork {
 
 #ifdef __KERNEL__
 struct bhv_desc;
+struct bhv_vnode;
 struct cred;
 struct ktrace;
-struct vnode;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -400,7 +400,7 @@ void		xfs_chash_init(struct xfs_mount *);
 void		xfs_chash_free(struct xfs_mount *);
 xfs_inode_t	*xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
 				  struct xfs_trans *);
-void            xfs_inode_lock_init(xfs_inode_t *, struct vnode *);
+void            xfs_inode_lock_init(xfs_inode_t *, struct bhv_vnode *);
 int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			 uint, uint, xfs_inode_t **, xfs_daddr_t);
 void		xfs_iput(xfs_inode_t *, uint);
@@ -461,7 +461,7 @@ void		xfs_ichgtime(xfs_inode_t *, int);
 xfs_fsize_t	xfs_file_last_byte(xfs_inode_t *);
 void		xfs_lock_inodes(xfs_inode_t **, int, int, uint);
 
-xfs_inode_t	*xfs_vtoi(struct vnode *vp);
+xfs_inode_t	*xfs_vtoi(struct bhv_vnode *vp);
 
 void		xfs_synchronize_atime(xfs_inode_t *);
 
@@ -509,7 +509,6 @@ extern struct kmem_zone	*xfs_chashlist_zone;
 extern struct kmem_zone	*xfs_ifork_zone;
 extern struct kmem_zone	*xfs_inode_zone;
 extern struct kmem_zone	*xfs_ili_zone;
-extern struct vnodeops	xfs_vnodeops;
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7497a481b2f5..f8e80d8e7237 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -25,7 +25,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -794,7 +792,7 @@ xfs_inode_item_pushbuf(
 	 * inode flush completed and the inode was taken off the AIL.
 	 * So, just get out.
 	 */
-	if ((valusema(&(ip->i_flock)) > 0)  ||
+	if (!issemalocked(&(ip->i_flock)) ||
 	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
 		iip->ili_pushbuf_flag = 0;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -816,7 +814,7 @@ xfs_inode_item_pushbuf(
 			 * If not, we can flush it async.
 			 */
 			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-				  (valusema(&(ip->i_flock)) <= 0));
+				  issemalocked(&(ip->i_flock)));
 			iip->ili_pushbuf_flag = 0;
 			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 			xfs_buftrace("INODE ITEM PUSH", bp);
@@ -864,7 +862,7 @@ xfs_inode_item_push(
 	ip = iip->ili_inode;
 
 	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
-	ASSERT(valusema(&(ip->i_flock)) <= 0);
+	ASSERT(issemalocked(&(ip->i_flock)));
 	/*
 	 * Since we were able to lock the inode's flush lock and
 	 * we found it on the AIL, the inode must be dirty.  This
@@ -1084,3 +1082,52 @@ xfs_istale_done(
 {
 	xfs_iflush_abort(iip->ili_inode);
 }
+
+/*
+ * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
+ * (which can have different field alignments) to the native version
+ */
+int
+xfs_inode_item_format_convert(
+	xfs_log_iovec_t		*buf,
+	xfs_inode_log_format_t	*in_f)
+{
+	if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
+		xfs_inode_log_format_32_t *in_f32;
+
+		in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
+		in_f->ilf_type = in_f32->ilf_type;
+		in_f->ilf_size = in_f32->ilf_size;
+		in_f->ilf_fields = in_f32->ilf_fields;
+		in_f->ilf_asize = in_f32->ilf_asize;
+		in_f->ilf_dsize = in_f32->ilf_dsize;
+		in_f->ilf_ino = in_f32->ilf_ino;
+		/* copy biggest field of ilf_u */
+		memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+		       in_f32->ilf_u.ilfu_uuid.__u_bits,
+		       sizeof(uuid_t));
+		in_f->ilf_blkno = in_f32->ilf_blkno;
+		in_f->ilf_len = in_f32->ilf_len;
+		in_f->ilf_boffset = in_f32->ilf_boffset;
+		return 0;
+	} else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
+		xfs_inode_log_format_64_t *in_f64;
+
+		in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
+		in_f->ilf_type = in_f64->ilf_type;
+		in_f->ilf_size = in_f64->ilf_size;
+		in_f->ilf_fields = in_f64->ilf_fields;
+		in_f->ilf_asize = in_f64->ilf_asize;
+		in_f->ilf_dsize = in_f64->ilf_dsize;
+		in_f->ilf_ino = in_f64->ilf_ino;
+		/* copy biggest field of ilf_u */
+		memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+		       in_f64->ilf_u.ilfu_uuid.__u_bits,
+		       sizeof(uuid_t));
+		in_f->ilf_blkno = in_f64->ilf_blkno;
+		in_f->ilf_len = in_f64->ilf_len;
+		in_f->ilf_boffset = in_f64->ilf_boffset;
+		return 0;
+	}
+	return EFSCORRUPTED;
+}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index c5dbf93b6661..5db6cd1b4cf3 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -23,25 +23,6 @@
  * log.  The size of the inline data/extents/b-tree root to be logged
  * (if any) is indicated in the ilf_dsize field.  Changes to this structure
  * must be added on to the end.
- *
- * Convention for naming inode log item versions :  The current version
- * is always named XFS_LI_INODE.  When an inode log item gets superseded,
- * add the latest version of IRIX that will generate logs with that item
- * to the version name.
- *
- * -Version 1 of this structure (XFS_LI_5_3_INODE) included up to the first
- *	union (ilf_u) field.  This was released with IRIX 5.3-XFS.
- * -Version 2 of this structure (XFS_LI_6_1_INODE) is currently the entire
- *	structure.  This was released with IRIX 6.0.1-XFS and IRIX 6.1.
- * -Version 3 of this structure (XFS_LI_INODE) is the same as version 2
- *	so a new structure definition wasn't necessary.  However, we had
- *	to add a new type because the inode cluster size changed from 4K
- *	to 8K and the version number had to be rev'ved to keep older kernels
- *	from trying to recover logs with the 8K buffers in them.  The logging
- *	code can handle recovery on different-sized clusters now so hopefully
- *	this'll be the last time we need to change the inode log item just
- *	for a change in the inode cluster size.  This new version was
- *	released with IRIX 6.2.
  */
 typedef struct xfs_inode_log_format {
 	unsigned short		ilf_type;	/* inode log item type */
@@ -59,18 +40,38 @@ typedef struct xfs_inode_log_format {
 	int			ilf_boffset;	/* off of inode in buffer */
 } xfs_inode_log_format_t;
 
-/* Initial version shipped with IRIX 5.3-XFS */
-typedef struct xfs_inode_log_format_v1 {
-	unsigned short		ilf_type;	/* inode log item type */
-	unsigned short		ilf_size;	/* size of this item */
-	uint			ilf_fields;	/* flags for fields logged */
-	uint			ilf_dsize;	/* size of data/ext/root */
-	xfs_ino_t		ilf_ino;	/* inode number */
+typedef struct xfs_inode_log_format_32 {
+	unsigned short		ilf_type;	/* 16: inode log item type */
+	unsigned short		ilf_size;	/* 16: size of this item */
+	uint			ilf_fields;	/* 32: flags for fields logged */
+	ushort			ilf_asize;	/* 16: size of attr d/ext/root */
+	ushort			ilf_dsize;	/* 16: size of data/ext/root */
+	xfs_ino_t		ilf_ino;	/* 64: inode number */
 	union {
-		xfs_dev_t	ilfu_rdev;	/* rdev value for dev inode*/
-		uuid_t		ilfu_uuid;	/* mount point value */
+		xfs_dev_t	ilfu_rdev;	/* 32: rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* 128: mount point value */
+	} ilf_u;
+	__int64_t		ilf_blkno;	/* 64: blkno of inode buffer */
+	int			ilf_len;	/* 32: len of inode buffer */
+	int			ilf_boffset;	/* 32: off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+
+typedef struct xfs_inode_log_format_64 {
+	unsigned short		ilf_type;	/* 16: inode log item type */
+	unsigned short		ilf_size;	/* 16: size of this item */
+	uint			ilf_fields;	/* 32: flags for fields logged */
+	ushort			ilf_asize;	/* 16: size of attr d/ext/root */
+	ushort			ilf_dsize;	/* 16: size of data/ext/root */
+	__uint32_t		ilf_pad;	/* 32: pad for 64 bit boundary */
+	xfs_ino_t		ilf_ino;	/* 64: inode number */
+	union {
+		xfs_dev_t	ilfu_rdev;	/* 32: rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* 128: mount point value */
 	} ilf_u;
-} xfs_inode_log_format_t_v1;
+	__int64_t		ilf_blkno;	/* 64: blkno of inode buffer */
+	int			ilf_len;	/* 32: len of inode buffer */
+	int			ilf_boffset;	/* 32: off of inode in buffer */
+} xfs_inode_log_format_64_t;
 
 /*
  * Flags for xfs_trans_log_inode flags field.
@@ -172,6 +173,8 @@ extern void xfs_inode_item_destroy(struct xfs_inode *);
 extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
 extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
 extern void xfs_iflush_abort(struct xfs_inode *);
+extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
+					 xfs_inode_log_format_t *);
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
index a07815661a8c..06d710c9ce4b 100644
--- a/fs/xfs/xfs_iocore.c
+++ b/fs/xfs/xfs_iocore.c
@@ -24,14 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
+#include "xfs_dfrag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -58,7 +57,7 @@ xfs_size_fn(
 
 STATIC int
 xfs_ioinit(
-	struct vfs		*vfsp,
+	struct bhv_vfs		*vfsp,
 	struct xfs_mount_args	*mntargs,
 	int			flags)
 {
@@ -68,6 +67,7 @@ xfs_ioinit(
 xfs_ioops_t	xfs_iocore_xfs = {
 	.xfs_ioinit		= (xfs_ioinit_t) xfs_ioinit,
 	.xfs_bmapi_func		= (xfs_bmapi_t) xfs_bmapi,
+	.xfs_bunmapi_func	= (xfs_bunmapi_t) xfs_bunmapi,
 	.xfs_bmap_eof_func	= (xfs_bmap_eof_t) xfs_bmap_eof,
 	.xfs_iomap_write_direct =
 			(xfs_iomap_write_direct_t) xfs_iomap_write_direct,
@@ -84,6 +84,7 @@ xfs_ioops_t	xfs_iocore_xfs = {
 	.xfs_unlock		= (xfs_unlk_t) xfs_iunlock,
 	.xfs_size_func		= (xfs_size_t) xfs_size_fn,
 	.xfs_iodone		= (xfs_iodone_t) fs_noerr,
+	.xfs_swap_extents_func	= (xfs_swap_extents_t) xfs_swap_extents,
 };
 
 void
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d5dfedcb8922..f1949c16df15 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -252,7 +250,7 @@ xfs_iomap(
 	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
 			(xfs_filblks_t)(end_fsb - offset_fsb),
 			bmapi_flags,  NULL, 0, &imap,
-			&nimaps, NULL);
+			&nimaps, NULL, NULL);
 
 	if (error)
 		goto out;
@@ -519,8 +517,8 @@ xfs_iomap_write_direct(
 	 */
 	XFS_BMAP_INIT(&free_list, &firstfsb);
 	nimaps = 1;
-	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
-		bmapi_flag, &firstfsb, 0, &imap, &nimaps, &free_list);
+	error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, bmapi_flag,
+		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
 	if (error)
 		goto error0;
 
@@ -610,8 +608,8 @@ xfs_iomap_eof_want_preallocate(
 	while (count_fsb > 0) {
 		imaps = nimaps;
 		firstblock = NULLFSBLOCK;
-		error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
-				  0, &firstblock, 0, imap, &imaps, NULL);
+		error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, 0,
+				  &firstblock, 0, imap, &imaps, NULL, NULL);
 		if (error)
 			return error;
 		for (n = 0; n < imaps; n++) {
@@ -695,11 +693,11 @@ retry:
 
 	nimaps = XFS_WRITE_IMAPS;
 	firstblock = NULLFSBLOCK;
-	error = xfs_bmapi(NULL, ip, offset_fsb,
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
 			  (xfs_filblks_t)(last_fsb - offset_fsb),
 			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
 			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-			  &nimaps, NULL);
+			  &nimaps, NULL, NULL);
 	if (error && (error != ENOSPC))
 		return XFS_ERROR(error);
 
@@ -832,9 +830,9 @@ xfs_iomap_write_allocate(
 			}
 
 			/* Go get the actual blocks */
-			error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
+			error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb,
 					XFS_BMAPI_WRITE, &first_block, 1,
-					imap, &nimaps, &free_list);
+					imap, &nimaps, &free_list, NULL);
 			if (error)
 				goto trans_cancel;
 
@@ -955,9 +953,9 @@ xfs_iomap_write_unwritten(
 		 */
 		XFS_BMAP_INIT(&free_list, &firstfsb);
 		nimaps = 1;
-		error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+		error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb,
 				  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
-				  1, &imap, &nimaps, &free_list);
+				  1, &imap, &nimaps, &free_list, NULL);
 		if (error)
 			goto error_on_bmapi_transaction;
 
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 94068d014f27..46249e4d1fea 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -41,11 +39,6 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 
-#ifndef HAVE_USERACC
-#define useracc(ubuffer, size, flags, foo) (0)
-#define unuseracc(ubuffer, size, flags)
-#endif
-
 STATIC int
 xfs_bulkstat_one_iget(
 	xfs_mount_t	*mp,		/* mount point for filesystem */
@@ -56,7 +49,7 @@ xfs_bulkstat_one_iget(
 {
 	xfs_dinode_core_t *dic;		/* dinode core info pointer */
 	xfs_inode_t	*ip;		/* incore inode pointer */
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	int		error;
 
 	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno);
@@ -336,15 +329,6 @@ xfs_bulkstat(
 	nimask = ~(nicluster - 1);
 	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
 	/*
-	 * Lock down the user's buffer. If a buffer was not sent, as in the case
-	 * disk quota code calls here, we skip this.
-	 */
-	if (ubuffer &&
-	    (error = useracc(ubuffer, ubcount * statstruct_size,
-			(B_READ|B_PHYS), NULL))) {
-		return error;
-	}
-	/*
 	 * Allocate a page-sized buffer for inode btree records.
 	 * We could try allocating something smaller, but for normal
 	 * calls we'll always (potentially) need the whole page.
@@ -650,8 +634,6 @@ xfs_bulkstat(
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
 	kmem_free(irbuf, NBPC);
-	if (ubuffer)
-		unuseracc(ubuffer, ubcount * statstruct_size, (B_READ|B_PHYS));
 	*ubcountp = ubelem;
 	if (agno >= mp->m_sb.sb_agcount) {
 		/*
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 11eb4e1b18c4..be5f12e07d22 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -45,7 +45,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
  */
 #define	BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
 #define	BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
-#define BULKSTAT_FG_VFSLOCKED	0x4	/* Already have vfs lock */
 
 /*
  * Return stat information in bulk (by-inode) for the filesystem.
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 32e841d2f26d..d8f5d4cbe8b7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -36,7 +35,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_log_recover.h"
 #include "xfs_trans_priv.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -402,7 +400,7 @@ xfs_log_release_iclog(xfs_mount_t *mp,
 	xlog_in_core_t	  *iclog = (xlog_in_core_t *)iclog_hndl;
 
 	if (xlog_state_release_iclog(log, iclog)) {
-		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 		return EIO;
 	}
 
@@ -498,9 +496,8 @@ xfs_log_mount(xfs_mount_t	*mp,
 	 * just worked.
 	 */
 	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
-		int	error;
-		vfs_t	*vfsp = XFS_MTOVFS(mp);
-		int	readonly = (vfsp->vfs_flag & VFS_RDONLY);
+		bhv_vfs_t	*vfsp = XFS_MTOVFS(mp);
+		int		error, readonly = (vfsp->vfs_flag & VFS_RDONLY);
 
 		if (readonly)
 			vfsp->vfs_flag &= ~VFS_RDONLY;
@@ -726,7 +723,7 @@ xfs_log_write(xfs_mount_t *	mp,
 		return XFS_ERROR(EIO);
 
 	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
-		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 	}
 	return error;
 }	/* xfs_log_write */
@@ -816,9 +813,9 @@ xfs_log_need_covered(xfs_mount_t *mp)
 	SPLDECL(s);
 	int		needed = 0, gen;
 	xlog_t		*log = mp->m_log;
-	vfs_t		*vfsp = XFS_MTOVFS(mp);
+	bhv_vfs_t	*vfsp = XFS_MTOVFS(mp);
 
-	if (fs_frozen(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
+	if (vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
 	    (vfsp->vfs_flag & VFS_RDONLY))
 		return 0;
 
@@ -956,7 +953,7 @@ xlog_iodone(xfs_buf_t *bp)
 			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
 		xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
 		XFS_BUF_STALE(bp);
-		xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
 		/*
 		 * This flag will be propagated to the trans-committed
 		 * callback routines to let them know that the log-commit
@@ -1261,7 +1258,7 @@ xlog_commit_record(xfs_mount_t  *mp,
 	ASSERT_ALWAYS(iclog);
 	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
 			       iclog, XLOG_COMMIT_TRANS))) {
-		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 	}
 	return error;
 }	/* xlog_commit_record */
@@ -1790,7 +1787,7 @@ xlog_write(xfs_mount_t *	mp,
 	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
 		"xfs_log_write: reservation ran out. Need to up reservation");
 	/* If we did not panic, shutdown the filesystem */
-	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 #endif
     } else
 	ticket->t_curr_res -= len;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1f0016b0b4ec..55b4237c2153 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -193,14 +191,14 @@ xlog_header_check_dump(
 {
 	int			b;
 
-	printk("%s:  SB : uuid = ", __FUNCTION__);
+	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __FUNCTION__);
 	for (b = 0; b < 16; b++)
-		printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]);
-	printk(", fmt = %d\n", XLOG_FMT);
-	printk("    log : uuid = ");
+		cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
+	cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
+	cmn_err(CE_DEBUG, "    log : uuid = ");
 	for (b = 0; b < 16; b++)
-		printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]);
-	printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
+		cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
+	cmn_err(CE_DEBUG, ", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
 }
 #else
 #define xlog_header_check_dump(mp, head)
@@ -282,7 +280,7 @@ xlog_recover_iodone(
 		mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
 		xfs_ioerror_alert("xlog_recover_iodone",
 				  mp, bp, XFS_BUF_ADDR(bp));
-		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 	}
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
@@ -1889,7 +1887,7 @@ xlog_recover_do_inode_buffer(
 
 		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
 					      next_unlinked_offset);
-		INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp);
+		*buffer_nextp = *logged_nextp;
 	}
 
 	return 0;
@@ -2292,12 +2290,22 @@ xlog_recover_do_inode_trans(
 	int			attr_index;
 	uint			fields;
 	xfs_dinode_core_t	*dicp;
+	int			need_free = 0;
 
 	if (pass == XLOG_RECOVER_PASS1) {
 		return 0;
 	}
 
-	in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
+		in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+	} else {
+		in_f = (xfs_inode_log_format_t *)kmem_alloc(
+			sizeof(xfs_inode_log_format_t), KM_SLEEP);
+		need_free = 1;
+		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
+		if (error)
+			goto error;
+	}
 	ino = in_f->ilf_ino;
 	mp = log->l_mp;
 	if (ITEM_TYPE(item) == XFS_LI_INODE) {
@@ -2323,8 +2331,10 @@ xlog_recover_do_inode_trans(
 	 * Inode buffers can be freed, look out for it,
 	 * and do not replay the inode.
 	 */
-	if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0))
-		return 0;
+	if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) {
+		error = 0;
+		goto error;
+	}
 
 	bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
 								XFS_BUF_LOCK);
@@ -2333,7 +2343,7 @@ xlog_recover_do_inode_trans(
 				  bp, imap.im_blkno);
 		error = XFS_BUF_GETERROR(bp);
 		xfs_buf_relse(bp);
-		return error;
+		goto error;
 	}
 	error = 0;
 	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
@@ -2350,7 +2360,8 @@ xlog_recover_do_inode_trans(
 			dip, bp, ino);
 		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
 				 XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 	dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
 	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
@@ -2360,7 +2371,8 @@ xlog_recover_do_inode_trans(
 			item, ino);
 		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
 				 XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 
 	/* Skip replay when the on disk inode is newer than the log one */
@@ -2376,7 +2388,8 @@ xlog_recover_do_inode_trans(
 			/* do nothing */
 		} else {
 			xfs_buf_relse(bp);
-			return 0;
+			error = 0;
+			goto error;
 		}
 	}
 	/* Take the opportunity to reset the flush iteration count */
@@ -2391,7 +2404,8 @@ xlog_recover_do_inode_trans(
 			xfs_fs_cmn_err(CE_ALERT, mp,
 				"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
 				item, dip, bp, ino);
-			return XFS_ERROR(EFSCORRUPTED);
+			error = EFSCORRUPTED;
+			goto error;
 		}
 	} else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2403,7 +2417,8 @@ xlog_recover_do_inode_trans(
 			xfs_fs_cmn_err(CE_ALERT, mp,
 				"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
 				item, dip, bp, ino);
-			return XFS_ERROR(EFSCORRUPTED);
+			error = EFSCORRUPTED;
+			goto error;
 		}
 	}
 	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
@@ -2415,7 +2430,8 @@ xlog_recover_do_inode_trans(
 			item, dip, bp, ino,
 			dicp->di_nextents + dicp->di_anextents,
 			dicp->di_nblocks);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
@@ -2424,7 +2440,8 @@ xlog_recover_do_inode_trans(
 		xfs_fs_cmn_err(CE_ALERT, mp,
 			"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
 			item, dip, bp, ino, dicp->di_forkoff);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 	if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
 		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
@@ -2433,7 +2450,8 @@ xlog_recover_do_inode_trans(
 		xfs_fs_cmn_err(CE_ALERT, mp,
 			"xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
 			item->ri_buf[1].i_len, item);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 
 	/* The core is in in-core format */
@@ -2521,7 +2539,8 @@ xlog_recover_do_inode_trans(
 			xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
 			ASSERT(0);
 			xfs_buf_relse(bp);
-			return XFS_ERROR(EIO);
+			error = EIO;
+			goto error;
 		}
 	}
 
@@ -2537,7 +2556,10 @@ write_inode_buffer:
 		error = xfs_bwrite(mp, bp);
 	}
 
-	return (error);
+error:
+	if (need_free)
+		kmem_free(in_f, sizeof(*in_f));
+	return XFS_ERROR(error);
 }
 
 /*
@@ -2674,32 +2696,32 @@ xlog_recover_do_dquot_trans(
  * structure into it, and adds the efi to the AIL with the given
  * LSN.
  */
-STATIC void
+STATIC int
 xlog_recover_do_efi_trans(
 	xlog_t			*log,
 	xlog_recover_item_t	*item,
 	xfs_lsn_t		lsn,
 	int			pass)
 {
+	int			error;
 	xfs_mount_t		*mp;
 	xfs_efi_log_item_t	*efip;
 	xfs_efi_log_format_t	*efi_formatp;
 	SPLDECL(s);
 
 	if (pass == XLOG_RECOVER_PASS1) {
-		return;
+		return 0;
 	}
 
 	efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
-	ASSERT(item->ri_buf[0].i_len ==
-	       (sizeof(xfs_efi_log_format_t) +
-		((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))));
 
 	mp = log->l_mp;
 	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
-	memcpy((char *)&(efip->efi_format), (char *)efi_formatp,
-	      sizeof(xfs_efi_log_format_t) +
-	      ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)));
+	if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
+					 &(efip->efi_format)))) {
+		xfs_efi_item_free(efip);
+		return error;
+	}
 	efip->efi_next_extent = efi_formatp->efi_nextents;
 	efip->efi_flags |= XFS_EFI_COMMITTED;
 
@@ -2708,6 +2730,7 @@ xlog_recover_do_efi_trans(
 	 * xfs_trans_update_ail() drops the AIL lock.
 	 */
 	xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
+	return 0;
 }
 
 
@@ -2738,9 +2761,10 @@ xlog_recover_do_efd_trans(
 	}
 
 	efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
-	ASSERT(item->ri_buf[0].i_len ==
-	       (sizeof(xfs_efd_log_format_t) +
-		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t))));
+	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
+		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
+	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
+		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
 	efi_id = efd_formatp->efd_efi_id;
 
 	/*
@@ -2810,15 +2834,14 @@ xlog_recover_do_trans(
 			if  ((error = xlog_recover_do_buffer_trans(log, item,
 								 pass)))
 				break;
-		} else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
-			   (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
-			   (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
+		} else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
 			if ((error = xlog_recover_do_inode_trans(log, item,
 								pass)))
 				break;
 		} else if (ITEM_TYPE(item) == XFS_LI_EFI) {
-			xlog_recover_do_efi_trans(log, item, trans->r_lsn,
-						  pass);
+			if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
+						  pass)))
+				break;
 		} else if (ITEM_TYPE(item) == XFS_LI_EFD) {
 			xlog_recover_do_efd_trans(log, item, pass);
 		} else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
@@ -3419,13 +3442,13 @@ xlog_unpack_data_checksum(
 	    if (rhead->h_chksum ||
 		((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
 		    cmn_err(CE_DEBUG,
-			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
+			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
 			    INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
 		    cmn_err(CE_DEBUG,
 "XFS: Disregard message if filesystem was created with non-DEBUG kernel");
 		    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
 			    cmn_err(CE_DEBUG,
-				"XFS: LogR this is a LogV2 filesystem");
+				"XFS: LogR this is a LogV2 filesystem\n");
 		    }
 		    log->l_flags |= XLOG_CHKSUM_MISMATCH;
 	    }
@@ -3798,7 +3821,7 @@ xlog_do_log_recovery(
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
 				      XLOG_RECOVER_PASS2);
 #ifdef DEBUG
-	{
+	if (!error) {
 		int	i;
 
 		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
@@ -3974,7 +3997,7 @@ xlog_recover_finish(
 		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
 	} else {
 		cmn_err(CE_DEBUG,
-			"!Ending clean XFS mount for filesystem: %s",
+			"!Ending clean XFS mount for filesystem: %s\n",
 			log->l_mp->m_fsname);
 	}
 	return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c0b1c2906880..10dbf203c62f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -196,7 +194,7 @@ xfs_mount_free(
 		kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);
 
 	if (remove_bhv) {
-		struct vfs	*vfsp = XFS_MTOVFS(mp);
+		struct bhv_vfs	*vfsp = XFS_MTOVFS(mp);
 
 		bhv_remove_all_vfsops(vfsp, 0);
 		VFS_REMOVEBHV(vfsp, &mp->m_bhv);
@@ -337,7 +335,7 @@ xfs_mount_validate_sb(
 
 xfs_agnumber_t
 xfs_initialize_perag(
-	struct vfs	*vfs,
+	bhv_vfs_t	*vfs,
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agcount)
 {
@@ -651,14 +649,14 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
  */
 int
 xfs_mountfs(
-	vfs_t		*vfsp,
+	bhv_vfs_t	*vfsp,
 	xfs_mount_t	*mp,
 	int		mfsi_flags)
 {
 	xfs_buf_t	*bp;
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
-	vnode_t		*rvp = NULL;
+	bhv_vnode_t	*rvp = NULL;
 	int		readio_log, writeio_log;
 	xfs_daddr_t	d;
 	__uint64_t	ret64;
@@ -934,18 +932,7 @@ xfs_mountfs(
 	vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid;
 	mp->m_dmevmask = 0;	/* not persistent; set after each mount */
 
-	/*
-	 * Select the right directory manager.
-	 */
-	mp->m_dirops =
-		XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
-			xfsv2_dirops :
-			xfsv1_dirops;
-
-	/*
-	 * Initialize directory manager's entries.
-	 */
-	XFS_DIR_MOUNT(mp);
+	xfs_dir_mount(mp);
 
 	/*
 	 * Initialize the attribute manager's entries.
@@ -1006,8 +993,9 @@ xfs_mountfs(
 
 	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
 		cmn_err(CE_WARN, "XFS: corrupted root inode");
-		prdev("Root inode %llu is not a directory",
-		      mp->m_ddev_targp, (unsigned long long)rip->i_ino);
+		cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
+			XFS_BUFTARG_NAME(mp->m_ddev_targp),
+			(unsigned long long)rip->i_ino);
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
 		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
 				 mp);
@@ -1094,7 +1082,7 @@ xfs_mountfs(
 int
 xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 {
-	struct vfs	*vfsp = XFS_MTOVFS(mp);
+	struct bhv_vfs	*vfsp = XFS_MTOVFS(mp);
 #if defined(DEBUG) || defined(INDUCE_IO_ERROR)
 	int64_t		fsid;
 #endif
@@ -1254,6 +1242,26 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 
 	xfs_trans_log_buf(tp, bp, first, last);
 }
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering that the minimum number
+ * of needed freelist blocks is 4 fsbs and a potential split of a file's
+ * bmap btree requires 1 fsb, we set the number of set-aside blocks to 8.
+ */
+#define SET_ASIDE_BLOCKS 8
+
 /*
  * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
  * a delta to a specified field in the in-core superblock.  Simply
@@ -1298,7 +1306,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
 		return 0;
 	case XFS_SBS_FDBLOCKS:
 
-		lcounter = (long long)mp->m_sb.sb_fdblocks;
+		lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS;
 		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
 
 		if (delta > 0) {		/* Putting blocks back */
@@ -1332,7 +1340,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
 			}
 		}
 
-		mp->m_sb.sb_fdblocks = lcounter;
+		mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS;
 		return 0;
 	case XFS_SBS_FREXTENTS:
 		lcounter = (long long)mp->m_sb.sb_frextents;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 668ad23fd37c..b2bd4be4200a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,8 +53,8 @@ typedef struct xfs_trans_reservations {
 #else
 struct cred;
 struct log;
-struct vfs;
-struct vnode;
+struct bhv_vfs;
+struct bhv_vnode;
 struct xfs_mount_args;
 struct xfs_ihash;
 struct xfs_chash;
@@ -63,9 +63,11 @@ struct xfs_perag;
 struct xfs_iocore;
 struct xfs_bmbt_irec;
 struct xfs_bmap_free;
+struct xfs_extdelta;
+struct xfs_swapext;
 
-extern struct vfsops xfs_vfsops;
-extern struct vnodeops xfs_vnodeops;
+extern struct bhv_vfsops xfs_vfsops;
+extern struct bhv_vnodeops xfs_vnodeops;
 
 #define	AIL_LOCK_T		lock_t
 #define	AIL_LOCKINIT(x,y)	spinlock_init(x,y)
@@ -78,15 +80,15 @@ extern struct vnodeops xfs_vnodeops;
  * Prototypes and functions for the Data Migration subsystem.
  */
 
-typedef int	(*xfs_send_data_t)(int, struct vnode *,
-			xfs_off_t, size_t, int, vrwlock_t *);
+typedef int	(*xfs_send_data_t)(int, struct bhv_vnode *,
+			xfs_off_t, size_t, int, bhv_vrwlock_t *);
 typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int	(*xfs_send_destroy_t)(struct vnode *, dm_right_t);
-typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct vfs *,
-			struct vnode *,
-			dm_right_t, struct vnode *, dm_right_t,
+typedef int	(*xfs_send_destroy_t)(struct bhv_vnode *, dm_right_t);
+typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct bhv_vfs *,
+			struct bhv_vnode *,
+			dm_right_t, struct bhv_vnode *, dm_right_t,
 			char *, char *, mode_t, int, int);
-typedef void	(*xfs_send_unmount_t)(struct vfs *, struct vnode *,
+typedef void	(*xfs_send_unmount_t)(struct bhv_vfs *, struct bhv_vnode *,
 			dm_right_t, mode_t, int, int);
 
 typedef struct xfs_dmops {
@@ -188,13 +190,18 @@ typedef struct xfs_qmops {
  * Prototypes and functions for I/O core modularization.
  */
 
-typedef int		(*xfs_ioinit_t)(struct vfs *,
+typedef int		(*xfs_ioinit_t)(struct bhv_vfs *,
 				struct xfs_mount_args *, int);
 typedef int		(*xfs_bmapi_t)(struct xfs_trans *, void *,
 				xfs_fileoff_t, xfs_filblks_t, int,
 				xfs_fsblock_t *, xfs_extlen_t,
 				struct xfs_bmbt_irec *, int *,
-				struct xfs_bmap_free *);
+				struct xfs_bmap_free *, struct xfs_extdelta *);
+typedef int		(*xfs_bunmapi_t)(struct xfs_trans *,
+				void *, xfs_fileoff_t,
+				xfs_filblks_t, int, xfs_extnum_t,
+				xfs_fsblock_t *, struct xfs_bmap_free *,
+				struct xfs_extdelta *, int *);
 typedef int		(*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *);
 typedef int		(*xfs_iomap_write_direct_t)(
 				void *, xfs_off_t, size_t, int,
@@ -213,11 +220,14 @@ typedef void		(*xfs_lock_demote_t)(void *, uint);
 typedef int		(*xfs_lock_nowait_t)(void *, uint);
 typedef void		(*xfs_unlk_t)(void *, unsigned int);
 typedef xfs_fsize_t	(*xfs_size_t)(void *);
-typedef xfs_fsize_t	(*xfs_iodone_t)(struct vfs *);
+typedef xfs_fsize_t	(*xfs_iodone_t)(struct bhv_vfs *);
+typedef int		(*xfs_swap_extents_t)(void *, void *,
+				struct xfs_swapext*);
 
 typedef struct xfs_ioops {
 	xfs_ioinit_t			xfs_ioinit;
 	xfs_bmapi_t			xfs_bmapi_func;
+	xfs_bunmapi_t			xfs_bunmapi_func;
 	xfs_bmap_eof_t			xfs_bmap_eof_func;
 	xfs_iomap_write_direct_t	xfs_iomap_write_direct;
 	xfs_iomap_write_delay_t		xfs_iomap_write_delay;
@@ -230,13 +240,17 @@ typedef struct xfs_ioops {
 	xfs_unlk_t			xfs_unlock;
 	xfs_size_t			xfs_size_func;
 	xfs_iodone_t			xfs_iodone;
+	xfs_swap_extents_t		xfs_swap_extents_func;
 } xfs_ioops_t;
 
 #define XFS_IOINIT(vfsp, args, flags) \
 	(*(mp)->m_io_ops.xfs_ioinit)(vfsp, args, flags)
-#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist)	\
+#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist,delta) \
 	(*(mp)->m_io_ops.xfs_bmapi_func) \
-		(trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist)
+		(trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist,delta)
+#define XFS_BUNMAPI(mp, trans,io,bno,len,f,nexts,first,flist,delta,done) \
+	(*(mp)->m_io_ops.xfs_bunmapi_func) \
+		(trans,(io)->io_obj,bno,len,f,nexts,first,flist,delta,done)
 #define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \
 	(*(mp)->m_io_ops.xfs_bmap_eof_func) \
 		((io)->io_obj, endoff, whichfork, eof)
@@ -266,6 +280,9 @@ typedef struct xfs_ioops {
 	(*(mp)->m_io_ops.xfs_size_func)((io)->io_obj)
 #define XFS_IODONE(vfsp) \
 	(*(mp)->m_io_ops.xfs_iodone)(vfsp)
+#define XFS_SWAP_EXTENTS(mp, io, tio, sxp) \
+	(*(mp)->m_io_ops.xfs_swap_extents_func) \
+		((io)->io_obj, (tio)->io_obj, sxp)
 
 #ifdef HAVE_PERCPU_SB
 
@@ -386,8 +403,6 @@ typedef struct xfs_mount {
 	__uint8_t		m_inode_quiesce;/* call quiesce on new inodes.
 						   field governed by m_ilock */
 	__uint8_t		m_sectbb_log;	/* sectlog - BBSHIFT */
-	__uint8_t		m_dirversion;	/* 1 or 2 */
-	xfs_dirops_t		m_dirops;	/* table of dir funcs */
 	int			m_dirblksize;	/* directory block sz--bytes */
 	int			m_dirblkfsbs;	/* directory block sz--fsbs */
 	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
@@ -494,16 +509,7 @@ xfs_preferred_iosize(xfs_mount_t *mp)
 
 #define XFS_FORCED_SHUTDOWN(mp)	((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
 #define xfs_force_shutdown(m,f)	\
-	VFS_FORCE_SHUTDOWN((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
-
-/*
- * Flags sent to xfs_force_shutdown.
- */
-#define XFS_METADATA_IO_ERROR	0x1
-#define XFS_LOG_IO_ERROR	0x2
-#define XFS_FORCE_UMOUNT	0x4
-#define XFS_CORRUPT_INCORE	0x8	/* Corrupt in-memory data structures */
-#define XFS_SHUTDOWN_REMOTE_REQ 0x10	/* Shutdown came from remote cell */
+	bhv_vfs_force_shutdown((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
 
 /*
  * Flags for xfs_mountfs
@@ -521,7 +527,7 @@ xfs_preferred_iosize(xfs_mount_t *mp)
  * Macros for getting from mount to vfs and back.
  */
 #define	XFS_MTOVFS(mp)		xfs_mtovfs(mp)
-static inline struct vfs *xfs_mtovfs(xfs_mount_t *mp)
+static inline struct bhv_vfs *xfs_mtovfs(xfs_mount_t *mp)
 {
 	return bhvtovfs(&mp->m_bhv);
 }
@@ -533,7 +539,7 @@ static inline xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp)
 }
 
 #define XFS_VFSTOM(vfs) xfs_vfstom(vfs)
-static inline xfs_mount_t *xfs_vfstom(vfs_t *vfs)
+static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs)
 {
 	return XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops));
 }
@@ -571,7 +577,7 @@ typedef struct xfs_mod_sb {
 extern xfs_mount_t *xfs_mount_init(void);
 extern void	xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern void	xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
-extern int	xfs_mountfs(struct vfs *, xfs_mount_t *mp, int);
+extern int	xfs_mountfs(struct bhv_vfs *, xfs_mount_t *mp, int);
 extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern int	xfs_unmountfs(xfs_mount_t *, struct cred *);
@@ -589,7 +595,7 @@ extern void	xfs_freesb(xfs_mount_t *);
 extern void	xfs_do_force_shutdown(bhv_desc_t *, int, char *, int);
 extern int	xfs_syncsub(xfs_mount_t *, int, int, int *);
 extern int	xfs_sync_inodes(xfs_mount_t *, int, int, int *);
-extern xfs_agnumber_t	xfs_initialize_perag(struct vfs *, xfs_mount_t *,
+extern xfs_agnumber_t	xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *,
 						xfs_agnumber_t);
 extern void	xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t);
 
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 1408a32eef88..320d63ff9ca2 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 7fbef974bce6..acb853b33ebb 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -365,7 +365,7 @@ typedef struct xfs_dqtrxops {
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
-extern struct bhv_vfsops xfs_qmops;
+extern struct bhv_module_vfsops xfs_qmops;
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 1f148762eb28..d98171deaa1c 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -22,13 +22,11 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -40,7 +38,6 @@
 #include "xfs_refcache.h"
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
-#include "xfs_dir_leaf.h"
 
 
 /*
@@ -87,8 +84,8 @@ STATIC int
 xfs_lock_for_rename(
 	xfs_inode_t	*dp1,	/* old (source) directory inode */
 	xfs_inode_t	*dp2,	/* new (target) directory inode */
-	vname_t		*vname1,/* old entry name */
-	vname_t		*vname2,/* new entry name */
+	bhv_vname_t	*vname1,/* old entry name */
+	bhv_vname_t	*vname2,/* new entry name */
 	xfs_inode_t	**ipp1,	/* inode of old entry */
 	xfs_inode_t	**ipp2,	/* inode of new entry, if it
 				   already exists, NULL otherwise. */
@@ -225,9 +222,9 @@ xfs_lock_for_rename(
 int
 xfs_rename(
 	bhv_desc_t	*src_dir_bdp,
-	vname_t		*src_vname,
-	vnode_t		*target_dir_vp,
-	vname_t		*target_vname,
+	bhv_vname_t	*src_vname,
+	bhv_vnode_t	*target_dir_vp,
+	bhv_vname_t	*target_vname,
 	cred_t		*credp)
 {
 	xfs_trans_t	*tp;
@@ -242,7 +239,7 @@ xfs_rename(
 	int		committed;
 	xfs_inode_t	*inodes[4];
 	int		target_ip_dropped = 0;	/* dropped target_ip link? */
-	vnode_t		*src_dir_vp;
+	bhv_vnode_t	*src_dir_vp;
 	int		spaceres;
 	int		target_link_zero = 0;
 	int		num_inodes;
@@ -398,34 +395,29 @@ xfs_rename(
 		 * fit before actually inserting it.
 		 */
 		if (spaceres == 0 &&
-		    (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name,
-				target_namelen))) {
+		    (error = xfs_dir_canenter(tp, target_dp, target_name,
+						target_namelen)))
 			goto error_return;
-		}
 		/*
 		 * If target does not exist and the rename crosses
 		 * directories, adjust the target directory link count
 		 * to account for the ".." reference from the new entry.
 		 */
-		error = XFS_DIR_CREATENAME(mp, tp, target_dp, target_name,
+		error = xfs_dir_createname(tp, target_dp, target_name,
 					   target_namelen, src_ip->i_ino,
 					   &first_block, &free_list, spaceres);
-		if (error == ENOSPC) {
+		if (error == ENOSPC)
 			goto error_return;
-		}
-		if (error) {
+		if (error)
 			goto abort_return;
-		}
 		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		if (new_parent && src_is_directory) {
 			error = xfs_bumplink(tp, target_dp);
-			if (error) {
+			if (error)
 				goto abort_return;
-			}
 		}
 	} else { /* target_ip != NULL */
-
 		/*
 		 * If target exists and it's a directory, check that both
 		 * target and source are directories and that target can be
@@ -435,7 +427,7 @@ xfs_rename(
 			/*
 			 * Make sure target dir is empty.
 			 */
-			if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) ||
+			if (!(xfs_dir_isempty(target_ip)) ||
 			    (target_ip->i_d.di_nlink > 2)) {
 				error = XFS_ERROR(EEXIST);
 				goto error_return;
@@ -451,12 +443,11 @@ xfs_rename(
 		 * In case there is already an entry with the same
 		 * name at the destination directory, remove it first.
 		 */
-		error = XFS_DIR_REPLACE(mp, tp, target_dp, target_name,
-			target_namelen, src_ip->i_ino, &first_block,
-			&free_list, spaceres);
-		if (error) {
+		error = xfs_dir_replace(tp, target_dp, target_name,
+					target_namelen, src_ip->i_ino,
+					&first_block, &free_list, spaceres);
+		if (error)
 			goto abort_return;
-		}
 		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		/*
@@ -464,9 +455,8 @@ xfs_rename(
 		 * dir no longer points to it.
 		 */
 		error = xfs_droplink(tp, target_ip);
-		if (error) {
+		if (error)
 			goto abort_return;
-		}
 		target_ip_dropped = 1;
 
 		if (src_is_directory) {
@@ -474,9 +464,8 @@ xfs_rename(
 			 * Drop the link from the old "." entry.
 			 */
 			error = xfs_droplink(tp, target_ip);
-			if (error) {
+			if (error)
 				goto abort_return;
-			}
 		}
 
 		/* Do this test while we still hold the locks */
@@ -488,18 +477,15 @@ xfs_rename(
 	 * Remove the source.
 	 */
 	if (new_parent && src_is_directory) {
-
 		/*
 		 * Rewrite the ".." entry to point to the new
 		 * directory.
 		 */
-		error = XFS_DIR_REPLACE(mp, tp, src_ip, "..", 2,
-					target_dp->i_ino, &first_block,
-					&free_list, spaceres);
+		error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino,
+					&first_block, &free_list, spaceres);
 		ASSERT(error != EEXIST);
-		if (error) {
+		if (error)
 			goto abort_return;
-		}
 		xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	} else {
@@ -527,16 +513,14 @@ xfs_rename(
 		 * entry that's moved no longer points to it.
 		 */
 		error = xfs_droplink(tp, src_dp);
-		if (error) {
+		if (error)
 			goto abort_return;
-		}
 	}
 
-	error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen,
+	error = xfs_dir_removename(tp, src_dp, src_name, src_namelen,
 			src_ip->i_ino, &first_block, &free_list, spaceres);
-	if (error) {
+	if (error)
 		goto abort_return;
-	}
 	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	/*
@@ -609,7 +593,7 @@ xfs_rename(
 	 * Let interposed file systems know about removed links.
 	 */
 	if (target_ip_dropped) {
-		VOP_LINK_REMOVED(XFS_ITOV(target_ip), target_dir_vp,
+		bhv_vop_link_removed(XFS_ITOV(target_ip), target_dir_vp,
 					target_link_zero);
 		IRELE(target_ip);
 	}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5b413946b1c5..0c1e42b037ef 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -141,7 +139,7 @@ xfs_growfs_rt_alloc(
 		cancelflags |= XFS_TRANS_ABORT;
 		error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
 			XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
-			resblks, &map, &nmap, &flist);
+			resblks, &map, &nmap, &flist, NULL);
 		if (!error && nmap < 1)
 			error = XFS_ERROR(ENOSPC);
 		if (error)
@@ -2404,10 +2402,10 @@ xfs_rtprint_range(
 {
 	xfs_extlen_t	i;		/* block number in the extent */
 
-	printk("%Ld: ", (long long)start);
+	cmn_err(CE_DEBUG, "%Ld: ", (long long)start);
 	for (i = 0; i < len; i++)
-		printk("%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
-	printk("\n");
+		cmn_err(CE_DEBUG, "%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
+	cmn_err(CE_DEBUG, "\n");
 }
 
 /*
@@ -2431,17 +2429,17 @@ xfs_rtprint_summary(
 			(void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c);
 			if (c) {
 				if (!p) {
-					printk("%Ld-%Ld:", 1LL << l,
+					cmn_err(CE_DEBUG, "%Ld-%Ld:", 1LL << l,
 						XFS_RTMIN((1LL << l) +
 							  ((1LL << l) - 1LL),
 							 mp->m_sb.sb_rextents));
 					p = 1;
 				}
-				printk(" %Ld:%d", (long long)i, c);
+				cmn_err(CE_DEBUG, " %Ld:%d", (long long)i, c);
 			}
 		}
 		if (p)
-			printk("\n");
+			cmn_err(CE_DEBUG, "\n");
 	}
 	if (sumbp)
 		xfs_trans_brelse(tp, sumbp);
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index a59c102cf214..defb2febaaf5 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -92,6 +90,90 @@ xfs_write_clear_setuid(
 }
 
 /*
+ * Handle logging requirements of various synchronous types of write.
+ */
+int
+xfs_write_sync_logforce(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip)
+{
+	int		error = 0;	/* only set by the transaction path */
+
+	/*
+	 * If we're treating this as O_DSYNC and we have not updated the
+	 * size, force the log.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
+	    !(ip->i_update_size)) {
+		xfs_inode_log_item_t	*iip = ip->i_itemp;
+
+		/*
+		 * If an allocation transaction occurred
+		 * without extending the size, then we have to force
+		 * the log up to the proper point to ensure that the
+		 * allocation is permanent.  We can't count on
+		 * the fact that buffered writes lock out direct I/O
+		 * writes - the direct I/O write could have extended
+		 * the size nontransactionally, then finished before
+		 * we started.  xfs_write_file will think that the file
+		 * didn't grow but the update isn't safe unless the
+		 * size change is logged.
+		 *
+		 * Force the log if we've committed a transaction
+		 * against the inode or if someone else has and
+		 * the commit record hasn't gone to disk (e.g.
+		 * the inode is pinned).  This guarantees that
+		 * all changes affecting the inode are permanent
+		 * when we return.
+		 */
+		if (iip && iip->ili_last_lsn) {
+			xfs_log_force(mp, iip->ili_last_lsn,
+					XFS_LOG_FORCE | XFS_LOG_SYNC);
+		} else if (xfs_ipincount(ip) > 0) {
+			xfs_log_force(mp, (xfs_lsn_t)0,
+					XFS_LOG_FORCE | XFS_LOG_SYNC);
+		}
+
+	} else {
+		xfs_trans_t	*tp;
+
+		/*
+		 * O_SYNC or O_DSYNC _with_ a size update are handled
+		 * the same way.
+		 *
+		 * If the write was synchronous then we need to make
+		 * sure that the inode modification time is permanent.
+		 * We'll have updated the timestamp above, so here
+		 * we use a synchronous transaction to log the inode.
+		 * It's not fast, but it's necessary.
+		 *
+		 * If this is a dsync write and the size got changed
+		 * non-transactionally, then we need to ensure that
+		 * the size change gets logged in a synchronous
+		 * transaction.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
+		if ((error = xfs_trans_reserve(tp, 0,
+						XFS_SWRITE_LOG_RES(mp),
+						0, 0, 0))) {
+			/* Transaction reserve failed */
+			xfs_trans_cancel(tp, 0);
+		} else {
+			/* Transaction reserve successful */
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(tp, ip);
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			xfs_trans_set_sync(tp);
+			error = xfs_trans_commit(tp, 0, NULL);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		}
+	}
+
+	return error;
+}
+
+/*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
  * the shop, make sure that absolutely nothing persistent happens to
@@ -109,12 +191,12 @@ xfs_do_force_shutdown(
 	xfs_mount_t	*mp;
 
 	mp = XFS_BHVTOM(bdp);
-	logerror = flags & XFS_LOG_IO_ERROR;
+	logerror = flags & SHUTDOWN_LOG_IO_ERROR;
 
-	if (!(flags & XFS_FORCE_UMOUNT)) {
-		cmn_err(CE_NOTE,
-		"xfs_force_shutdown(%s,0x%x) called from line %d of file %s.  Return address = 0x%p",
-			mp->m_fsname,flags,lnnum,fname,__return_address);
+	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+		cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from "
+				 "line %d of file %s.  Return address = 0x%p",
+			mp->m_fsname, flags, lnnum, fname, __return_address);
 	}
 	/*
 	 * No need to duplicate efforts.
@@ -125,33 +207,37 @@ xfs_do_force_shutdown(
 	/*
 	 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
 	 * queue up anybody new on the log reservations, and wakes up
-	 * everybody who's sleeping on log reservations and tells
-	 * them the bad news.
+	 * everybody who's sleeping on log reservations to tell them
+	 * the bad news.
 	 */
 	if (xfs_log_force_umount(mp, logerror))
 		return;
 
-	if (flags & XFS_CORRUPT_INCORE) {
+	if (flags & SHUTDOWN_CORRUPT_INCORE) {
 		xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp,
     "Corruption of in-memory data detected.  Shutting down filesystem: %s",
 			mp->m_fsname);
 		if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
 			xfs_stack_trace();
 		}
-	} else if (!(flags & XFS_FORCE_UMOUNT)) {
+	} else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
 		if (logerror) {
 			xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp,
-			"Log I/O Error Detected.  Shutting down filesystem: %s",
+		"Log I/O Error Detected.  Shutting down filesystem: %s",
+				mp->m_fsname);
+		} else if (flags & SHUTDOWN_DEVICE_REQ) {
+			xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
+		"All device paths lost.  Shutting down filesystem: %s",
 				mp->m_fsname);
-		} else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
+		} else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
 			xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
-				"I/O Error Detected.  Shutting down filesystem: %s",
+		"I/O Error Detected.  Shutting down filesystem: %s",
 				mp->m_fsname);
 		}
 	}
-	if (!(flags & XFS_FORCE_UMOUNT)) {
-		cmn_err(CE_ALERT,
-		"Please umount the filesystem, and rectify the problem(s)");
+	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+		cmn_err(CE_ALERT, "Please umount the filesystem, "
+				  "and rectify the problem(s)");
 	}
 }
 
@@ -335,7 +421,7 @@ xfs_bwrite(
 		 * from bwrite and we could be tracing a buffer that has
 		 * been reused.
 		 */
-		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 	}
 	return (error);
 }
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index e63795644478..188b296ff50c 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -75,6 +75,7 @@ xfs_fsb_to_db_io(struct xfs_iocore *io, xfs_fsblock_t fsb)
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
+extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -87,9 +88,10 @@ extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 /*
  * Prototypes for functions in xfs_vnodeops.c.
  */
-extern int xfs_rwlock(bhv_desc_t *bdp, vrwlock_t write_lock);
-extern void xfs_rwunlock(bhv_desc_t *bdp, vrwlock_t write_lock);
-extern int xfs_setattr(bhv_desc_t *bdp, vattr_t *vap, int flags, cred_t *credp);
+extern int xfs_rwlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock);
+extern void xfs_rwunlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock);
+extern int xfs_setattr(bhv_desc_t *, bhv_vattr_t *vap, int flags,
+		       cred_t *credp);
 extern int xfs_change_file_space(bhv_desc_t *bdp, int cmd, xfs_flock64_t *bf,
 				 xfs_off_t offset, cred_t *credp, int flags);
 extern int xfs_set_dmattrs(bhv_desc_t *bdp, u_int evmask, u_int16_t state,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8d056cef5d1f..ee2721e0de4d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -236,11 +234,8 @@ xfs_trans_alloc(
 	xfs_mount_t	*mp,
 	uint		type)
 {
-	fs_check_frozen(XFS_MTOVFS(mp), SB_FREEZE_TRANS);
-	atomic_inc(&mp->m_active_trans);
-
-	return (_xfs_trans_alloc(mp, type));
-
+	vfs_wait_for_freeze(XFS_MTOVFS(mp), SB_FREEZE_TRANS);
+	return _xfs_trans_alloc(mp, type);
 }
 
 xfs_trans_t *
@@ -250,12 +245,9 @@ _xfs_trans_alloc(
 {
 	xfs_trans_t	*tp;
 
-	ASSERT(xfs_trans_zone != NULL);
-	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	atomic_inc(&mp->m_active_trans);
 
-	/*
-	 * Initialize the transaction structure.
-	 */
+	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
 	tp->t_magic = XFS_TRANS_MAGIC;
 	tp->t_type = type;
 	tp->t_mountp = mp;
@@ -263,8 +255,7 @@ _xfs_trans_alloc(
 	tp->t_busy_free = XFS_LBC_NUM_SLOTS;
 	XFS_LIC_INIT(&(tp->t_items));
 	XFS_LBC_INIT(&(tp->t_busy));
-
-	return (tp);
+	return tp;
 }
 
 /*
@@ -303,7 +294,7 @@ xfs_trans_dup(
 	tp->t_blk_res = tp->t_blk_res_used;
 	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
 	tp->t_rtx_res = tp->t_rtx_res_used;
-	PFLAGS_DUP(&tp->t_pflags, &ntp->t_pflags);
+	ntp->t_pflags = tp->t_pflags;
 
 	XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
 
@@ -335,14 +326,11 @@ xfs_trans_reserve(
 	uint		logcount)
 {
 	int		log_flags;
-	int		error;
-	int	rsvd;
-
-	error = 0;
-	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	int		error = 0;
+	int		rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
 	/* Mark this thread as being in a transaction */
-        PFLAGS_SET_FSTRANS(&tp->t_pflags);
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
 	/*
 	 * Attempt to reserve the needed disk blocks by decrementing
@@ -353,7 +341,7 @@ xfs_trans_reserve(
 		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
 					  -blocks, rsvd);
 		if (error != 0) {
-                        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 			return (XFS_ERROR(ENOSPC));
 		}
 		tp->t_blk_res += blocks;
@@ -426,9 +414,9 @@ undo_blocks:
 		tp->t_blk_res = 0;
 	}
 
-        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	return (error);
+	return error;
 }
 
 
@@ -819,7 +807,7 @@ shut_us_down:
 			if (commit_lsn == -1 && !shutdown)
 				shutdown = XFS_ERROR(EIO);
 		}
-                PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
 		xfs_trans_free_busy(tp);
 		xfs_trans_free(tp);
@@ -846,7 +834,7 @@ shut_us_down:
 	 */
 	nvec = xfs_trans_count_vecs(tp);
 	if (nvec == 0) {
-		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 		goto shut_us_down;
 	} else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
 		log_vector = log_vector_fast;
@@ -884,7 +872,7 @@ shut_us_down:
 	 * had pinned, clean up, free trans structure, and return error.
 	 */
 	if (error || commit_lsn == -1) {
-                PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
 		return XFS_ERROR(EIO);
 	}
@@ -926,7 +914,7 @@ shut_us_down:
 	/*
 	 * Mark this thread as no longer being in a transaction
 	 */
-	PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
 	/*
 	 * Once all the items of the transaction have been copied
@@ -1148,7 +1136,7 @@ xfs_trans_cancel(
 	 */
 	if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
 		XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
-		xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	}
 #ifdef DEBUG
 	if (!(flags & XFS_TRANS_ABORT)) {
@@ -1182,7 +1170,7 @@ xfs_trans_cancel(
 	}
 
 	/* mark this thread as no longer being in a transaction */
-        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
 	xfs_trans_free_items(tp, flags);
 	xfs_trans_free_busy(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 100d9a4b38ee..cb65c3a603f5 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -805,12 +805,9 @@ typedef struct xfs_trans {
 	((mp)->m_sb.sb_inodesize + \
 	 (mp)->m_sb.sb_sectsize * 2 + \
 	 (mp)->m_dirblksize + \
-	 (XFS_DIR_IS_V1(mp) ? 0 : \
-	    XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1))) + \
+	 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
 	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * (4 + \
-		 (XFS_DIR_IS_V1(mp) ? 0 : \
-			 XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
+	 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
 		 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 
 #define	XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 19ab24af1c1c..558c87ff0c41 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -22,7 +22,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
@@ -363,9 +362,10 @@ xfs_trans_delete_ail(
 			AIL_UNLOCK(mp, s);
 		else {
 			xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
-				"xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
+		"%s: attempting to delete a log item that is not in the AIL",
+					__FUNCTION__);
 			AIL_UNLOCK(mp, s);
-			xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 		}
 	}
 }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c74c31ebc81c..60b6b898022b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -320,7 +318,7 @@ xfs_trans_read_buf(
 			if (xfs_error_target == target) {
 				if (((xfs_req_num++) % xfs_error_mod) == 0) {
 					xfs_buf_relse(bp);
-					printk("Returning error!\n");
+					cmn_err(CE_DEBUG, "Returning error!\n");
 					return XFS_ERROR(EIO);
 				}
 			}
@@ -369,7 +367,7 @@ xfs_trans_read_buf(
 				 */
 				if (tp->t_flags & XFS_TRANS_DIRTY)
 					xfs_force_shutdown(tp->t_mountp,
-							   XFS_METADATA_IO_ERROR);
+							SHUTDOWN_META_IO_ERROR);
 				return error;
 			}
 		}
@@ -414,7 +412,7 @@ xfs_trans_read_buf(
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
 				  bp, blkno);
 		if (tp->t_flags & XFS_TRANS_DIRTY)
-			xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
+			xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
 		xfs_buf_relse(bp);
 		return error;
 	}
@@ -423,9 +421,9 @@ xfs_trans_read_buf(
 		if (xfs_error_target == target) {
 			if (((xfs_req_num++) % xfs_error_mod) == 0) {
 				xfs_force_shutdown(tp->t_mountp,
-						   XFS_METADATA_IO_ERROR);
+						   SHUTDOWN_META_IO_ERROR);
 				xfs_buf_relse(bp);
-				printk("Returning error in trans!\n");
+				cmn_err(CE_DEBUG, "Returning trans error!\n");
 				return XFS_ERROR(EIO);
 			}
 		}
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 7d7d627f25df..b290270dd4a6 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -22,7 +22,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 7c5894d59f81..b8db1d5cde5a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 1117d600d741..2912aac07c7b 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -493,7 +493,7 @@ xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
 				break;
 			} else {
 				/* out-of-order vacancy */
-				printk("OOO vacancy lbcp 0x%p\n", lbcp);
+				cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
 				ASSERT(0);
 			}
 		}
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7fe3792b18df..4ea2e5074bdd 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -30,8 +30,7 @@
 	  XFS_EXTENTADD_SPACE_RES(mp,w))
 #define	XFS_DAENTER_1B(mp,w)	((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
 #define	XFS_DAENTER_DBS(mp,w)	\
-	(XFS_DA_NODE_MAXDEPTH + \
-	 ((XFS_DIR_IS_V2(mp) && (w) == XFS_DATA_FORK) ? 2 : 0))
+	(XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
 #define	XFS_DAENTER_BLOCKS(mp,w)	\
 	(XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
 #define	XFS_DAENTER_BMAP1B(mp,w)	\
@@ -41,10 +40,7 @@
 #define	XFS_DAENTER_SPACE_RES(mp,w)	\
 	(XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
 #define	XFS_DAREMOVE_SPACE_RES(mp,w)	XFS_DAENTER_BMAPS(mp,w)
-#define	XFS_DIRENTER_MAX_SPLIT(mp,nl)	\
-	(((mp)->m_sb.sb_blocksize == 512 && \
-	  XFS_DIR_IS_V1(mp) && \
-	  (nl) >= XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN) ? 2 : 1)
+#define	XFS_DIRENTER_MAX_SPLIT(mp,nl)	1
 #define	XFS_DIRENTER_SPACE_RES(mp,nl)	\
 	(XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
 	 XFS_DIRENTER_MAX_SPLIT(mp,nl))
@@ -57,8 +53,7 @@
  * Space reservation values for various transactions.
  */
 #define	XFS_ADDAFORK_SPACE_RES(mp)	\
-	((mp)->m_dirblkfsbs + \
-	 (XFS_DIR_IS_V1(mp) ? 0 : XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)))
+	((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
 #define	XFS_ATTRRM_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
 /* This macro is not used - see inline code in xfs_attr_set */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 34654ec6ae10..9014d7e44488 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -24,12 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -51,10 +49,10 @@
  */
 int
 xfs_get_dir_entry(
-	vname_t		*dentry,
+	bhv_vname_t	*dentry,
 	xfs_inode_t	**ipp)
 {
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = VNAME_TO_VNODE(dentry);
 
@@ -69,11 +67,11 @@ int
 xfs_dir_lookup_int(
 	bhv_desc_t	*dir_bdp,
 	uint		lock_mode,
-	vname_t		*dentry,
+	bhv_vname_t	*dentry,
 	xfs_ino_t	*inum,
 	xfs_inode_t	**ipp)
 {
-	vnode_t		*dir_vp;
+	bhv_vnode_t	*dir_vp;
 	xfs_inode_t	*dp;
 	int		error;
 
@@ -82,8 +80,7 @@ xfs_dir_lookup_int(
 
 	dp = XFS_BHVTOI(dir_bdp);
 
-	error = XFS_DIR_LOOKUP(dp->i_mount, NULL, dp,
-				VNAME(dentry), VNAMELEN(dentry), inum);
+	error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum);
 	if (!error) {
 		/*
 		 * Unlock the directory. We do this because we can't
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index 472661a3b6d8..fe953e98afa7 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -23,9 +23,10 @@
 #define	ITRACE(ip)	vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \
 				(inst_t *)__return_address)
 
-extern int xfs_rename (bhv_desc_t *, vname_t *, vnode_t *, vname_t *, cred_t *);
-extern int xfs_get_dir_entry (vname_t *, xfs_inode_t **);
-extern int xfs_dir_lookup_int (bhv_desc_t *, uint, vname_t *, xfs_ino_t *,
+extern int xfs_rename (bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *,
+			bhv_vname_t *, cred_t *);
+extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **);
+extern int xfs_dir_lookup_int (bhv_desc_t *, uint, bhv_vname_t *, xfs_ino_t *,
 				xfs_inode_t **);
 extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 36ea1b2094f2..6c96391f3f1a 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -131,9 +129,6 @@ xfs_init(void)
 #ifdef XFS_BMBT_TRACE
 	xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
 #endif
-#ifdef XFS_DIR_TRACE
-	xfs_dir_trace_buf = ktrace_alloc(XFS_DIR_TRACE_SIZE, KM_SLEEP);
-#endif
 #ifdef XFS_ATTR_TRACE
 	xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
 #endif
@@ -177,9 +172,6 @@ xfs_cleanup(void)
 #ifdef XFS_ATTR_TRACE
 	ktrace_free(xfs_attr_trace_buf);
 #endif
-#ifdef XFS_DIR_TRACE
-	ktrace_free(xfs_dir_trace_buf);
-#endif
 #ifdef XFS_BMBT_TRACE
 	ktrace_free(xfs_bmbt_trace_buf);
 #endif
@@ -212,7 +204,7 @@ xfs_cleanup(void)
  */
 STATIC int
 xfs_start_flags(
-	struct vfs		*vfs,
+	struct bhv_vfs		*vfs,
 	struct xfs_mount_args	*ap,
 	struct xfs_mount	*mp)
 {
@@ -337,7 +329,7 @@ xfs_start_flags(
  */
 STATIC int
 xfs_finish_flags(
-	struct vfs		*vfs,
+	struct bhv_vfs		*vfs,
 	struct xfs_mount_args	*ap,
 	struct xfs_mount	*mp)
 {
@@ -423,7 +415,7 @@ xfs_mount(
 	struct xfs_mount_args	*args,
 	cred_t			*credp)
 {
-	struct vfs		*vfsp = bhvtovfs(bhvp);
+	struct bhv_vfs		*vfsp = bhvtovfs(bhvp);
 	struct bhv_desc		*p;
 	struct xfs_mount	*mp = XFS_BHVTOM(bhvp);
 	struct block_device	*ddev, *logdev, *rtdev;
@@ -552,10 +544,10 @@ xfs_unmount(
 	int		flags,
 	cred_t		*credp)
 {
-	struct vfs	*vfsp = bhvtovfs(bdp);
+	bhv_vfs_t	*vfsp = bhvtovfs(bdp);
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
 	xfs_inode_t	*rip;
-	vnode_t		*rvp;
+	bhv_vnode_t	*rvp;
 	int		unmount_event_wanted = 0;
 	int		unmount_event_flags = 0;
 	int		xfs_unmountfs_needed = 0;
@@ -665,9 +657,8 @@ xfs_mntupdate(
 	int				*flags,
 	struct xfs_mount_args		*args)
 {
-	struct vfs	*vfsp = bhvtovfs(bdp);
+	bhv_vfs_t	*vfsp = bhvtovfs(bdp);
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
-	int		error;
 
 	if (!(*flags & MS_RDONLY)) {			/* rw/ro -> rw */
 		if (vfsp->vfs_flag & VFS_RDONLY)
@@ -679,7 +670,7 @@ xfs_mntupdate(
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
 		}
 	} else if (!(vfsp->vfs_flag & VFS_RDONLY)) {	/* rw -> ro */
-		VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+		bhv_vfs_sync(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL);
 		xfs_quiesce_fs(mp);
 		xfs_log_unmount_write(mp);
 		xfs_unmountfs_writesb(mp);
@@ -702,7 +693,7 @@ xfs_unmount_flush(
 	xfs_inode_t	*rip = mp->m_rootip;
 	xfs_inode_t	*rbmip;
 	xfs_inode_t	*rsumip = NULL;
-	vnode_t		*rvp = XFS_ITOV(rip);
+	bhv_vnode_t	*rvp = XFS_ITOV(rip);
 	int		error;
 
 	xfs_ilock(rip, XFS_ILOCK_EXCL);
@@ -781,9 +772,9 @@ fscorrupt_out2:
 STATIC int
 xfs_root(
 	bhv_desc_t	*bdp,
-	vnode_t		**vpp)
+	bhv_vnode_t	**vpp)
 {
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip);
 	VN_HOLD(vp);
@@ -801,8 +792,8 @@ xfs_root(
 STATIC int
 xfs_statvfs(
 	bhv_desc_t	*bdp,
-	xfs_statfs_t	*statp,
-	vnode_t		*vp)
+	bhv_statvfs_t	*statp,
+	bhv_vnode_t	*vp)
 {
 	__uint64_t	fakeinos;
 	xfs_extlen_t	lsize;
@@ -900,7 +891,7 @@ xfs_sync(
 /*
  * xfs sync routine for internal use
  *
- * This routine supports all of the flags defined for the generic VFS_SYNC
+ * This routine supports all of the flags defined for the generic vfs_sync
  * interface as explained above under xfs_sync.  In the interests of not
  * changing interfaces within the 6.5 family, additional internally-
  * required functions are specified within a separate xflags parameter,
@@ -917,7 +908,7 @@ xfs_sync_inodes(
 	xfs_inode_t	*ip = NULL;
 	xfs_inode_t	*ip_next;
 	xfs_buf_t	*bp;
-	vnode_t		*vp = NULL;
+	bhv_vnode_t	*vp = NULL;
 	int		error;
 	int		last_error;
 	uint64_t	fflag;
@@ -1156,9 +1147,9 @@ xfs_sync_inodes(
 			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 			if (XFS_FORCED_SHUTDOWN(mp)) {
-				VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+				bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
 			} else {
-				VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF);
+				bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF);
 			}
 
 			xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1178,8 +1169,8 @@ xfs_sync_inodes(
 				 * across calls to the buffer cache.
 				 */
 				xfs_iunlock(ip, XFS_ILOCK_SHARED);
-				VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1,
-							fflag, FI_NONE, error);
+				error = bhv_vop_flush_pages(vp, (xfs_off_t)0,
+							-1, fflag, FI_NONE);
 				xfs_ilock(ip, XFS_ILOCK_SHARED);
 			}
 
@@ -1231,9 +1222,7 @@ xfs_sync_inodes(
 						 * marker and free it.
 						 */
 						XFS_MOUNT_ILOCK(mp);
-
 						IPOINTER_REMOVE(ip, mp);
-
 						XFS_MOUNT_IUNLOCK(mp);
 
 						ASSERT(!(lock_flags &
@@ -1421,7 +1410,7 @@ xfs_sync_inodes(
 /*
  * xfs sync routine for internal use
  *
- * This routine supports all of the flags defined for the generic VFS_SYNC
+ * This routine supports all of the flags defined for the generic vfs_sync
  * interface as explained above under xfs_sync.  In the interests of not
  * changing interfaces within the 6.5 family, additional internally-
  * required functions are specified within a separate xflags parameter,
@@ -1574,7 +1563,7 @@ xfs_syncsub(
 STATIC int
 xfs_vget(
 	bhv_desc_t	*bdp,
-	vnode_t		**vpp,
+	bhv_vnode_t	**vpp,
 	fid_t		*fidp)
 {
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
@@ -1657,10 +1646,10 @@ xfs_vget(
 #define MNTOPT_NOATTR2	"noattr2"	/* do not use attr2 attribute format */
 
 STATIC unsigned long
-suffix_strtoul(const char *cp, char **endp, unsigned int base)
+suffix_strtoul(char *s, char **endp, unsigned int base)
 {
 	int	last, shift_left_factor = 0;
-	char	*value = (char *)cp;
+	char	*value = s;
 
 	last = strlen(value) - 1;
 	if (value[last] == 'K' || value[last] == 'k') {
@@ -1676,7 +1665,7 @@ suffix_strtoul(const char *cp, char **endp, unsigned int base)
 		value[last] = '\0';
 	}
 
-	return simple_strtoul(cp, endp, base) << shift_left_factor;
+	return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
 }
 
 STATIC int
@@ -1686,7 +1675,7 @@ xfs_parseargs(
 	struct xfs_mount_args	*args,
 	int			update)
 {
-	struct vfs		*vfsp = bhvtovfs(bhv);
+	bhv_vfs_t		*vfsp = bhvtovfs(bhv);
 	char			*this_char, *value, *eov;
 	int			dsunit, dswidth, vol_dsunit, vol_dswidth;
 	int			iosize;
@@ -1708,42 +1697,48 @@ xfs_parseargs(
 
 		if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			args->logbufs = simple_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			args->logbufsize = suffix_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			strncpy(args->logname, value, MAXNAMELEN);
 		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			strncpy(args->mtpt, value, MAXNAMELEN);
 		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			strncpy(args->rtname, value, MAXNAMELEN);
 		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -1752,7 +1747,8 @@ xfs_parseargs(
 			args->iosizelog = (uint8_t) iosize;
 		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -1761,7 +1757,8 @@ xfs_parseargs(
 			args->iosizelog = ffs(iosize) - 1;
 		} else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -1782,7 +1779,8 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_INO64)) {
 			args->flags |= XFSMNT_INO64;
 #if !XFS_BIG_INUMS
-			printk("XFS: %s option not allowed on this system\n",
+			cmn_err(CE_WARN,
+				"XFS: %s option not allowed on this system",
 				this_char);
 			return EINVAL;
 #endif
@@ -1792,14 +1790,16 @@ xfs_parseargs(
 			args->flags |= XFSMNT_SWALLOC;
 		} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			dsunit = simple_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -1807,7 +1807,8 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
 			args->flags &= ~XFSMNT_32BITINODES;
 #if !XFS_BIG_INUMS
-			printk("XFS: %s option not allowed on this system\n",
+			cmn_err(CE_WARN,
+				"XFS: %s option not allowed on this system",
 				this_char);
 			return EINVAL;
 #endif
@@ -1831,36 +1832,41 @@ xfs_parseargs(
 			args->flags &= ~XFSMNT_ATTR2;
 		} else if (!strcmp(this_char, "osyncisdsync")) {
 			/* no-op, this is now the default */
-printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
+			cmn_err(CE_WARN,
+	"XFS: osyncisdsync is now the default, option is deprecated.");
 		} else if (!strcmp(this_char, "irixsgid")) {
-printk("XFS: irixsgid is now a sysctl(2) variable, option is deprecated.\n");
+			cmn_err(CE_WARN,
+	"XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
 		} else {
-			printk("XFS: unknown mount option [%s].\n", this_char);
+			cmn_err(CE_WARN,
+				"XFS: unknown mount option [%s].", this_char);
 			return EINVAL;
 		}
 	}
 
 	if (args->flags & XFSMNT_NORECOVERY) {
 		if ((vfsp->vfs_flag & VFS_RDONLY) == 0) {
-			printk("XFS: no-recovery mounts must be read-only.\n");
+			cmn_err(CE_WARN,
+				"XFS: no-recovery mounts must be read-only.");
 			return EINVAL;
 		}
 	}
 
 	if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
-		printk(
-	"XFS: sunit and swidth options incompatible with the noalign option\n");
+		cmn_err(CE_WARN,
+	"XFS: sunit and swidth options incompatible with the noalign option");
 		return EINVAL;
 	}
 
 	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
-		printk("XFS: sunit and swidth must be specified together\n");
+		cmn_err(CE_WARN,
+			"XFS: sunit and swidth must be specified together");
 		return EINVAL;
 	}
 
 	if (dsunit && (dswidth % dsunit != 0)) {
-		printk(
-	"XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n",
+		cmn_err(CE_WARN,
+	"XFS: stripe width (%d) must be a multiple of the stripe unit (%d)",
 			dswidth, dsunit);
 		return EINVAL;
 	}
@@ -1907,7 +1913,7 @@ xfs_showargs(
 	};
 	struct proc_xfs_info	*xfs_infop;
 	struct xfs_mount	*mp = XFS_BHVTOM(bhv);
-	struct vfs		*vfsp = XFS_MTOVFS(mp);
+	struct bhv_vfs		*vfsp = XFS_MTOVFS(mp);
 
 	for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
 		if (mp->m_flags & xfs_infop->flag)
@@ -1967,7 +1973,7 @@ xfs_freeze(
 }
 
 
-vfsops_t xfs_vfsops = {
+bhv_vfsops_t xfs_vfsops = {
 	BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS),
 	.vfs_parseargs		= xfs_parseargs,
 	.vfs_showargs		= xfs_showargs,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7027ae68ee38..00a6b7dc24a0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -16,8 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-#include <linux/capability.h>
-
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
@@ -27,7 +25,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -35,13 +32,11 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_itable.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
@@ -58,32 +53,14 @@
 #include "xfs_log_priv.h"
 #include "xfs_mac.h"
 
-
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 2 extents back from
- * bmapi.
- */
-#define SYMLINK_MAPS 2
-
-/*
- * For xfs, we check that the file isn't too big to be opened by this kernel.
- * No other open action is required for regular files.  Devices are handled
- * through the specfs file system, pipes through fifofs.  Device and
- * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
- * when a new vnode is first looked up or created.
- */
 STATIC int
 xfs_open(
 	bhv_desc_t	*bdp,
 	cred_t		*credp)
 {
 	int		mode;
-	vnode_t		*vp;
-	xfs_inode_t	*ip;
-
-	vp = BHV_TO_VNODE(bdp);
-	ip = XFS_BHVTOI(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
+	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
@@ -101,6 +78,35 @@ xfs_open(
 	return 0;
 }
 
+STATIC int
+xfs_close(
+	bhv_desc_t	*bdp,
+	int		flags,
+	lastclose_t	lastclose,
+	cred_t		*credp)
+{
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
+	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return XFS_ERROR(EIO);
+
+	if (lastclose != L_TRUE || !VN_ISREG(vp))
+		return 0;
+
+	/*
+	 * If we previously truncated this file and removed old data in
+	 * the process, we want to initiate "early" writeout on the last
+	 * close.  This is an attempt to combat the notorious NULL files
+	 * problem which is particularly noticeable from a truncate down,
+	 * buffered (re-)write (delalloc), followed by a crash.  What we
+	 * are effectively doing here is significantly reducing the time
+	 * window where we'd otherwise be exposed to that problem.
+	 */
+	if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
+		return bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
+	return 0;
+}
 
 /*
  * xfs_getattr
@@ -108,13 +114,13 @@ xfs_open(
 STATIC int
 xfs_getattr(
 	bhv_desc_t	*bdp,
-	vattr_t		*vap,
+	bhv_vattr_t	*vap,
 	int		flags,
 	cred_t		*credp)
 {
 	xfs_inode_t	*ip;
 	xfs_mount_t	*mp;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp  = BHV_TO_VNODE(bdp);
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
@@ -241,7 +247,7 @@ xfs_getattr(
 int
 xfs_setattr(
 	bhv_desc_t		*bdp,
-	vattr_t			*vap,
+	bhv_vattr_t		*vap,
 	int			flags,
 	cred_t			*credp)
 {
@@ -255,7 +261,7 @@ xfs_setattr(
 	uid_t			uid=0, iuid=0;
 	gid_t			gid=0, igid=0;
 	int			timeflags = 0;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	xfs_prid_t		projid=0, iprojid=0;
 	int			mandlock_before, mandlock_after;
 	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
@@ -347,7 +353,6 @@ xfs_setattr(
 	 */
 	tp = NULL;
 	lock_flags = XFS_ILOCK_EXCL;
-	ASSERT(flags & ATTR_NOLOCK ? flags & ATTR_DMI : 1);
 	if (flags & ATTR_NOLOCK)
 		need_iolock = 0;
 	if (!(mask & XFS_AT_SIZE)) {
@@ -666,9 +671,17 @@ xfs_setattr(
 					    ((ip->i_d.di_nlink != 0 ||
 					      !(mp->m_flags & XFS_MOUNT_WSYNC))
 					     ? 1 : 0));
-			if (code) {
+			if (code)
 				goto abort_return;
-			}
+			/*
+			 * Truncated "down", so we're removing references
+			 * to old data here - if we now delay flushing for
+			 * a long time, we expose ourselves unduly to the
+			 * notorious NULL files problem.  So, we mark this
+			 * vnode and flush it when the file is closed, and
+			 * do not wait the usual (long) time for writeout.
+			 */
+			VTRUNCATE(vp);
 		}
 		/*
 		 * Have to do this even if the file's size doesn't change.
@@ -800,6 +813,8 @@ xfs_setattr(
 				di_flags |= XFS_DIFLAG_NODUMP;
 			if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 				di_flags |= XFS_DIFLAG_PROJINHERIT;
+			if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
+				di_flags |= XFS_DIFLAG_NODEFRAG;
 			if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 				if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 					di_flags |= XFS_DIFLAG_RTINHERIT;
@@ -869,7 +884,7 @@ xfs_setattr(
 	 */
 	mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 	if (mandlock_before != mandlock_after) {
-		VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
+		bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
 				 mandlock_after);
 	}
 
@@ -936,6 +951,13 @@ xfs_access(
 
 
 /*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 2 extents back from
+ * bmapi.
+ */
+#define SYMLINK_MAPS 2
+
+/*
  * xfs_readlink
  *
  */
@@ -950,7 +972,7 @@ xfs_readlink(
 	int		count;
 	xfs_off_t	offset;
 	int		pathlen;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	int		error = 0;
 	xfs_mount_t	*mp;
 	int             nmaps;
@@ -1000,7 +1022,7 @@ xfs_readlink(
 		nmaps = SYMLINK_MAPS;
 
 		error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
-				  0, NULL, 0, mval, &nmaps, NULL);
+				  0, NULL, 0, mval, &nmaps, NULL, NULL);
 
 		if (error) {
 			goto error_return;
@@ -1208,8 +1230,8 @@ xfs_inactive_free_eofblocks(
 
 	nimaps = 1;
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
-			  NULL, 0, &imap, &nimaps, NULL);
+	error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
+			  NULL, 0, &imap, &nimaps, NULL, NULL);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!error && (nimaps != 0) &&
@@ -1338,7 +1360,7 @@ xfs_inactive_symlink_rmt(
 	nmaps = ARRAY_SIZE(mval);
 	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
 			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-			&free_list)))
+			&free_list, NULL)))
 		goto error0;
 	/*
 	 * Invalidate the block(s).
@@ -1353,7 +1375,7 @@ xfs_inactive_symlink_rmt(
 	 * Unmap the dead block(s) to the free_list.
 	 */
 	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-			&first_block, &free_list, &done)))
+			&first_block, &free_list, NULL, &done)))
 		goto error1;
 	ASSERT(done);
 	/*
@@ -1469,9 +1491,6 @@ xfs_inactive_symlink_local(
 	return 0;
 }
 
-/*
- *
- */
 STATIC int
 xfs_inactive_attrs(
 	xfs_inode_t	*ip,
@@ -1524,16 +1543,16 @@ xfs_release(
 	bhv_desc_t	*bdp)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	xfs_mount_t	*mp;
 	int		error;
 
 	vp = BHV_TO_VNODE(bdp);
 	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
 
-	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
+	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
 		return 0;
-	}
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
 	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
@@ -1545,8 +1564,6 @@ xfs_release(
 		return 0;
 #endif
 
-	mp = ip->i_mount;
-
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
 		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -1579,8 +1596,8 @@ xfs_inactive(
 	cred_t		*credp)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
-	xfs_bmap_free_t	free_list; 
+	bhv_vnode_t	*vp;
+	xfs_bmap_free_t	free_list;
 	xfs_fsblock_t	first_block;
 	int		committed;
 	xfs_trans_t	*tp;
@@ -1760,7 +1777,7 @@ xfs_inactive(
 			cmn_err(CE_NOTE,
 		"xfs_inactive:	xfs_ifree() returned an error = %d on %s",
 				error, mp->m_fsname);
-			xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 		}
 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
 	} else {
@@ -1795,17 +1812,17 @@ xfs_inactive(
 STATIC int
 xfs_lookup(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
-	vnode_t			**vpp,
+	bhv_vname_t		*dentry,
+	bhv_vnode_t		**vpp,
 	int			flags,
-	vnode_t			*rdir,
+	bhv_vnode_t		*rdir,
 	cred_t			*credp)
 {
 	xfs_inode_t		*dp, *ip;
 	xfs_ino_t		e_inum;
 	int			error;
 	uint			lock_mode;
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 
 	dir_vp = BHV_TO_VNODE(dir_bdp);
 	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
@@ -1832,15 +1849,15 @@ xfs_lookup(
 STATIC int
 xfs_create(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
-	vattr_t			*vap,
-	vnode_t			**vpp,
+	bhv_vname_t		*dentry,
+	bhv_vattr_t		*vap,
+	bhv_vnode_t		**vpp,
 	cred_t			*credp)
 {
 	char			*name = VNAME(dentry);
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	xfs_inode_t		*dp, *ip;
-	vnode_t		        *vp=NULL;
+	bhv_vnode_t	        *vp = NULL;
 	xfs_trans_t		*tp;
 	xfs_mount_t	        *mp;
 	xfs_dev_t		rdev;
@@ -1938,8 +1955,7 @@ xfs_create(
 	if (error)
 		goto error_return;
 
-	if (resblks == 0 &&
-	    (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
+	if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
 		goto error_return;
 	rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
 	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
@@ -1970,9 +1986,9 @@ xfs_create(
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	dp_joined_to_trans = B_TRUE;
 
-	error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
-		&first_block, &free_list,
-		resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
+					&first_block, &free_list, resblks ?
+					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
 		goto abort_return;
@@ -2026,7 +2042,7 @@ xfs_create(
 	 * Propagate the fact that the vnode changed after the
 	 * xfs_inode locks have been released.
 	 */
-	VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
+	bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);
 
 	*vpp = vp;
 
@@ -2107,7 +2123,7 @@ int xfs_rm_attempts;
 STATIC int
 xfs_lock_dir_and_entry(
 	xfs_inode_t	*dp,
-	vname_t		*dentry,
+	bhv_vname_t	*dentry,
 	xfs_inode_t	*ip)	/* inode of entry 'name' */
 {
 	int		attempts;
@@ -2321,10 +2337,10 @@ int remove_which_error_return = 0;
 STATIC int
 xfs_remove(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
+	bhv_vname_t		*dentry,
 	cred_t			*credp)
 {
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	char			*name = VNAME(dentry);
 	xfs_inode_t             *dp, *ip;
 	xfs_trans_t             *tp = NULL;
@@ -2448,8 +2464,8 @@ xfs_remove(
 	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
 	 */
 	XFS_BMAP_INIT(&free_list, &first_block);
-	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
-		&first_block, &free_list, 0);
+	error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
+					&first_block, &free_list, 0);
 	if (error) {
 		ASSERT(error != ENOENT);
 		REMOVE_DEBUG_TRACE(__LINE__);
@@ -2511,7 +2527,7 @@ xfs_remove(
 	/*
 	 * Let interposed file systems know about removed links.
 	 */
-	VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
+	bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
 
 	IRELE(ip);
 
@@ -2564,8 +2580,8 @@ xfs_remove(
 STATIC int
 xfs_link(
 	bhv_desc_t		*target_dir_bdp,
-	vnode_t			*src_vp,
-	vname_t			*dentry,
+	bhv_vnode_t		*src_vp,
+	bhv_vname_t		*dentry,
 	cred_t			*credp)
 {
 	xfs_inode_t		*tdp, *sip;
@@ -2577,7 +2593,7 @@ xfs_link(
 	xfs_fsblock_t           first_block;
 	int			cancel_flags;
 	int			committed;
-	vnode_t			*target_dir_vp;
+	bhv_vnode_t		*target_dir_vp;
 	int			resblks;
 	char			*target_name = VNAME(dentry);
 	int			target_namelen;
@@ -2668,13 +2684,12 @@ xfs_link(
 	}
 
 	if (resblks == 0 &&
-	    (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
-			target_namelen)))
+	    (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
 		goto error_return;
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
-	error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
+	error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
 				   sip->i_ino, &first_block, &free_list,
 				   resblks);
 	if (error)
@@ -2734,15 +2749,15 @@ std_return:
 STATIC int
 xfs_mkdir(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
-	vattr_t			*vap,
-	vnode_t			**vpp,
+	bhv_vname_t		*dentry,
+	bhv_vattr_t		*vap,
+	bhv_vnode_t		**vpp,
 	cred_t			*credp)
 {
 	char			*dir_name = VNAME(dentry);
 	xfs_inode_t             *dp;
 	xfs_inode_t		*cdp;	/* inode of created dir */
-	vnode_t			*cvp;	/* vnode of created dir */
+	bhv_vnode_t		*cvp;	/* vnode of created dir */
 	xfs_trans_t		*tp;
 	xfs_mount_t		*mp;
 	int			cancel_flags;
@@ -2750,7 +2765,7 @@ xfs_mkdir(
 	int			committed;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	boolean_t		dp_joined_to_trans;
 	boolean_t		created = B_FALSE;
 	int			dm_event_sent = 0;
@@ -2840,7 +2855,7 @@ xfs_mkdir(
 		goto error_return;
 
 	if (resblks == 0 &&
-	    (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
+	    (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
 		goto error_return;
 	/*
 	 * create the directory inode.
@@ -2867,9 +2882,9 @@ xfs_mkdir(
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
-	error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
-			cdp->i_ino, &first_block, &free_list,
-			resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
+				   &first_block, &free_list, resblks ?
+				   resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
 		goto error1;
@@ -2883,16 +2898,14 @@ xfs_mkdir(
 	 */
 	dp->i_gen++;
 
-	error = XFS_DIR_INIT(mp, tp, cdp, dp);
-	if (error) {
+	error = xfs_dir_init(tp, cdp, dp);
+	if (error)
 		goto error2;
-	}
 
 	cdp->i_gen = 1;
 	error = xfs_bumplink(tp, dp);
-	if (error) {
+	if (error)
 		goto error2;
-	}
 
 	cvp = XFS_ITOV(cdp);
 
@@ -2969,7 +2982,7 @@ std_return:
 STATIC int
 xfs_rmdir(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
+	bhv_vname_t		*dentry,
 	cred_t			*credp)
 {
 	char			*name = VNAME(dentry);
@@ -2982,7 +2995,7 @@ xfs_rmdir(
 	xfs_fsblock_t           first_block;
 	int			cancel_flags;
 	int			committed;
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	int			dm_di_mode = 0;
 	int			last_cdp_link;
 	int			namelen;
@@ -3101,16 +3114,15 @@ xfs_rmdir(
 		error = XFS_ERROR(ENOTEMPTY);
 		goto error_return;
 	}
-	if (!XFS_DIR_ISEMPTY(mp, cdp)) {
+	if (!xfs_dir_isempty(cdp)) {
 		error = XFS_ERROR(ENOTEMPTY);
 		goto error_return;
 	}
 
-	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
-		&first_block, &free_list, resblks);
-	if (error) {
+	error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
+					&first_block, &free_list, resblks);
+	if (error)
 		goto error1;
-	}
 
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
@@ -3181,7 +3193,7 @@ xfs_rmdir(
 	/*
 	 * Let interposed file systems know about removed links.
 	 */
-	VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
+	bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
 
 	IRELE(cdp);
 
@@ -3209,8 +3221,6 @@ xfs_rmdir(
 
 
 /*
- * xfs_readdir
- *
  * Read dp's entries starting at uiop->uio_offset and translate them into
  * bufsize bytes worth of struct dirents starting at bufbase.
  */
@@ -3230,28 +3240,23 @@ xfs_readdir(
 					       (inst_t *)__return_address);
 	dp = XFS_BHVTOI(dir_bdp);
 
-	if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return XFS_ERROR(EIO);
-	}
 
 	lock_mode = xfs_ilock_map_shared(dp);
-	error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
+	error = xfs_dir_getdents(tp, dp, uiop, eofp);
 	xfs_iunlock_map_shared(dp, lock_mode);
 	return error;
 }
 
 
-/*
- * xfs_symlink
- *
- */
 STATIC int
 xfs_symlink(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
-	vattr_t			*vap,
+	bhv_vname_t		*dentry,
+	bhv_vattr_t		*vap,
 	char			*target_path,
-	vnode_t			**vpp,
+	bhv_vnode_t		**vpp,
 	cred_t			*credp)
 {
 	xfs_trans_t		*tp;
@@ -3263,7 +3268,7 @@ xfs_symlink(
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	boolean_t		dp_joined_to_trans;
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	uint			cancel_flags;
 	int			committed;
 	xfs_fileoff_t		first_fsb;
@@ -3308,7 +3313,7 @@ xfs_symlink(
 		int len, total;
 		char *path;
 
-		for(total = 0, path = target_path; total < pathlen;) {
+		for (total = 0, path = target_path; total < pathlen;) {
 			/*
 			 * Skip any slashes.
 			 */
@@ -3402,7 +3407,7 @@ xfs_symlink(
 	 * Check for ability to enter directory entry, if no space reserved.
 	 */
 	if (resblks == 0 &&
-	    (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
+	    (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
 		goto error_return;
 	/*
 	 * Initialize the bmap freelist prior to calling either
@@ -3457,7 +3462,7 @@ xfs_symlink(
 		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
 				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
 				  &first_block, resblks, mval, &nmaps,
-				  &free_list);
+				  &free_list, NULL);
 		if (error) {
 			goto error1;
 		}
@@ -3489,11 +3494,10 @@ xfs_symlink(
 	/*
 	 * Create the directory entry for the symlink.
 	 */
-	error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
-			ip->i_ino, &first_block, &free_list, resblks);
-	if (error) {
+	error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
+				   &first_block, &free_list, resblks);
+	if (error)
 		goto error1;
-	}
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
@@ -3541,7 +3545,7 @@ std_return:
 	}
 
 	if (!error) {
-		vnode_t *vp;
+		bhv_vnode_t *vp;
 
 		ASSERT(ip);
 		vp = XFS_ITOV(ip);
@@ -3606,10 +3610,10 @@ xfs_fid2(
 int
 xfs_rwlock(
 	bhv_desc_t	*bdp,
-	vrwlock_t	locktype)
+	bhv_vrwlock_t	locktype)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
 	if (VN_ISDIR(vp))
@@ -3637,10 +3641,10 @@ xfs_rwlock(
 void
 xfs_rwunlock(
 	bhv_desc_t	*bdp,
-	vrwlock_t	locktype)
+	bhv_vrwlock_t	locktype)
 {
 	xfs_inode_t     *ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
 	if (VN_ISDIR(vp))
@@ -3744,7 +3748,6 @@ xfs_inode_flush(
 	return error;
 }
 
-
 int
 xfs_set_dmattrs (
 	bhv_desc_t	*bdp,
@@ -3785,16 +3788,12 @@ xfs_set_dmattrs (
 	return error;
 }
 
-
-/*
- * xfs_reclaim
- */
 STATIC int
 xfs_reclaim(
 	bhv_desc_t	*bdp)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
 	ip = XFS_BHVTOI(bdp);
@@ -3849,7 +3848,7 @@ xfs_finish_reclaim(
 	int		sync_mode)
 {
 	xfs_ihash_t	*ih = ip->i_hash;
-	vnode_t		*vp = XFS_ITOV_NULL(ip);
+	bhv_vnode_t	*vp = XFS_ITOV_NULL(ip);
 	int		error;
 
 	if (vp && VN_BAD(vp))
@@ -4116,10 +4115,10 @@ retry:
 		 * Issue the xfs_bmapi() call to allocate the blocks
 		 */
 		XFS_BMAP_INIT(&free_list, &firstfsb);
-		error = xfs_bmapi(tp, ip, startoffset_fsb,
+		error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
 				  allocatesize_fsb, bmapi_flag,
 				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list);
+				  &free_list, NULL);
 		if (error) {
 			goto error0;
 		}
@@ -4199,8 +4198,8 @@ xfs_zero_remaining_bytes(
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
-			&nimap, NULL);
+		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
+			NULL, 0, &imap, &nimap, NULL, NULL);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -4259,7 +4258,7 @@ xfs_free_file_space(
 	xfs_off_t		len,
 	int			attr_flags)
 {
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	int			committed;
 	int			done;
 	xfs_off_t		end_dmi_offset;
@@ -4308,7 +4307,6 @@ xfs_free_file_space(
 			return error;
 	}
 
-	ASSERT(attr_flags & ATTR_NOLOCK ? attr_flags & ATTR_DMI : 1);
 	if (attr_flags & ATTR_NOLOCK)
 		need_iolock = 0;
 	if (need_iolock) {
@@ -4326,7 +4324,7 @@ xfs_free_file_space(
 	if (VN_CACHED(vp) != 0) {
 		xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
 				ctooff(offtoct(ioffset)), -1);
-		VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(ioffset)),
+		bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
 				-1, FI_REMAPF_LOCKED);
 	}
 
@@ -4338,8 +4336,8 @@ xfs_free_file_space(
 	 */
 	if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
-			&imap, &nimap, NULL);
+		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
+			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -4353,8 +4351,8 @@ xfs_free_file_space(
 				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
 		}
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
-			&imap, &nimap, NULL);
+		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
+			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -4426,9 +4424,9 @@ xfs_free_file_space(
 		 * issue the bunmapi() call to free the blocks
 		 */
 		XFS_BMAP_INIT(&free_list, &firstfsb);
-		error = xfs_bunmapi(tp, ip, startoffset_fsb,
+		error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
-				  0, 2, &firstfsb, &free_list, &done);
+				  0, 2, &firstfsb, &free_list, NULL, &done);
 		if (error) {
 			goto error0;
 		}
@@ -4488,8 +4486,8 @@ xfs_change_file_space(
 	xfs_off_t	startoffset;
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
-	vattr_t		va;
-	vnode_t		*vp;
+	bhv_vattr_t	va;
+	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
@@ -4642,9 +4640,10 @@ xfs_change_file_space(
 	return error;
 }
 
-vnodeops_t xfs_vnodeops = {
+bhv_vnodeops_t xfs_vnodeops = {
 	BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
 	.vop_open		= xfs_open,
+	.vop_close		= xfs_close,
 	.vop_read		= xfs_read,
 #ifdef HAVE_SENDFILE
 	.vop_sendfile		= xfs_sendfile,
author	Dmitry Torokhov <dtor_core@ameritech.net>	2006-06-26 01:31:38 -0400
committer	Dmitry Torokhov <dtor_core@ameritech.net>	2006-06-26 01:31:38 -0400
commit	4854c7b27f0975a2b629f35ea3996d2968eb7c4f (patch)
tree	4102bdb70289764a2058aff0f907b13d7cf0e0d1 /fs
parent	3cbd5b32cb625f5c0f1b1476d154fac873dd49ce (diff)
parent	fcc18e83e1f6fd9fa6b333735bf0fcd530655511 (diff)
download	linux-4854c7b27f0975a2b629f35ea3996d2968eb7c4f.tar.gz linux-4854c7b27f0975a2b629f35ea3996d2968eb7c4f.tar.bz2 linux-4854c7b27f0975a2b629f35ea3996d2968eb7c4f.zip